Reformat and set the net and channel for 4chan

This commit is contained in:
Mark Veidemanis 2022-09-05 07:20:30 +01:00
parent dcd648e1d2
commit 9c9d49dcd2
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
3 changed files with 14 additions and 13 deletions

8
db.py
View File

@ -41,8 +41,8 @@ def store_message(msg):
# print(body_post) # print(body_post)
try: try:
# Bulk index operations # Bulk index operations
api_instance.bulk(body_post, async_req=True) api_response = api_instance.bulk(body_post) # , async_req=True
# print(api_response) print(api_response)
except ApiException as e: except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e) print("Exception when calling IndexApi->bulk: %s\n" % e)
@ -82,8 +82,8 @@ def store_message_bulk(data):
# print(body_post) # print(body_post)
try: try:
# Bulk index operations # Bulk index operations
api_instance.bulk(body_post, async_req=True) api_response = api_instance.bulk(body_post) # , async_req=True
# print(api_response) print(api_response)
except ApiException as e: except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e) print("Exception when calling IndexApi->bulk: %s\n" % e)
print("FINISHED PROCESSING SPLIT") print("FINISHED PROCESSING SPLIT")

View File

@ -4,21 +4,18 @@ import random
import string import string
from concurrent.futures import ProcessPoolExecutor from concurrent.futures import ProcessPoolExecutor
from datetime import datetime from datetime import datetime
from math import ceil
import aiohttp import aiohttp
import ujson import ujson
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from numpy import array_split
from siphashc import siphash from siphashc import siphash
import db import db
import util import util
from schemas.ch4_s import ATTRMAP from schemas.ch4_s import ATTRMAP
from numpy import array_split
from math import ceil
# CONFIGURATION # # CONFIGURATION #
# Number of 4chan threads to request at once # Number of 4chan threads to request at once
@ -40,6 +37,7 @@ CPU_THREADS = 2
p = ProcessPoolExecutor(CPU_THREADS) p = ProcessPoolExecutor(CPU_THREADS)
class Chan4(object): class Chan4(object):
""" """
4chan indexer, crawler and ingester. 4chan indexer, crawler and ingester.
@ -119,7 +117,7 @@ class Chan4(object):
i += 1 i += 1
if i == n: if i == n:
raise StopIteration raise StopIteration
except: except StopIteration:
print("Take items took", i, "items") print("Take items took", i, "items")
async def get_threads_content(self, thread_list): async def get_threads_content(self, thread_list):
@ -213,6 +211,9 @@ class Chan4(object):
posts[key][index]["msg"] = msg posts[key][index]["msg"] = msg
posts[key][index]["src"] = "4ch" posts[key][index]["src"] = "4ch"
posts[key][index]["net"] = board
posts[key][index]["channel"] = thread
to_store.append(posts[key][index]) to_store.append(posts[key][index])
# print({name_map[name]: val for name, val in post.items()}) # print({name_map[name]: val for name, val in post.items()})