You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

106 lines
3.3 KiB
Python

from concurrent.futures import ProcessPoolExecutor
import asyncio
import os
import ujson
from siphashc import siphash
import db
import util
# 4chan schema
from schemas.ch4_s import ATTRMAP
# For key generation
import string
import random
# For timestamp processing
import datetime
# For 4chan message parsing
from bs4 import BeautifulSoup
from numpy import array_split
from math import ceil
# Module-level logger, obtained from the project's util helper under the name "process"
log = util.get_logger("process")
# Maximum number of CPU threads to use for post processing.
# os.cpu_count() returns None when the count cannot be determined; fall back
# to 1 so that size comparisons and divisions against CPU_THREADS keep working.
CPU_THREADS = os.cpu_count() or 1
# Shared process pool that post processing is submitted to
p = ProcessPoolExecutor(CPU_THREADS)
def get_hash_key():
    """
    Return the hashing key stored under "hashing_key" in db.r.

    If a value is already stored it is decoded as ASCII and returned.
    Otherwise a new key of 16 random lowercase ASCII letters is generated,
    written back under "hashing_key" and returned.
    """
    stored = db.r.get("hashing_key")
    if stored:
        # Existing key: the store hands back bytes, callers want a str
        hash_key = stored.decode("ascii")
        log.debug(f"Decoded hash key: {hash_key}")
        return hash_key

    # No key yet: build one and persist it for later lookups
    alphabet = string.ascii_lowercase
    hash_key = "".join([random.choice(alphabet) for _ in range(16)])
    log.debug(f"Created new hash key: {hash_key}")
    db.r.set("hashing_key", hash_key)
    return hash_key
# Hashing key handed to siphash() in process_data. It is resolved once when
# the module is imported, which queries db.r through get_hash_key().
hash_key = get_hash_key()
async def spawn_processing_threads(data):
    """
    Split a batch of messages into chunks and process them in the pool.

    A batch with fewer items than CPU_THREADS is handed over as a single
    chunk. Larger batches are divided with numpy's array_split into
    ceil(len(data) / msg_per_core) chunks, where msg_per_core is
    len(data) // CPU_THREADS.

    Every chunk is submitted through process_data_thread before any of them
    is awaited, so the chunks can be processed concurrently.

    :param data: sequence of messages to process
    """
    print("SPAWN", data)
    if len(data) < CPU_THREADS:
        split_data = [data]
    else:
        msg_per_core = len(data) // CPU_THREADS
        print("MSG PER CORE", msg_per_core)
        split_data = array_split(data, ceil(len(data) / msg_per_core))
    print("SPLIT DATA", split_data)

    # Create all the chunk jobs first and wait for them together; awaiting
    # inside the loop would run the chunks strictly one after another and
    # leave all but one pool worker idle.
    jobs = []
    for split in split_data:
        print("DELEGATING TO THREAD", len(split))
        jobs.append(process_data_thread(split))
    await asyncio.gather(*jobs)
async def process_data_thread(data):
    """
    Helper to run process_data on a list of data in the process pool.

    The call is submitted to the module-level executor `p` via the event
    loop's run_in_executor, and this coroutine completes once the submitted
    call has finished.

    :param data: chunk of messages passed through to process_data
    """
    # Native coroutine: the generator-based @asyncio.coroutine / "yield from"
    # form is deprecated and no longer available in recent Python releases.
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(p, process_data, data)
def process_data(data):
    """
    Normalise a batch of messages in place.

    Messages whose "src" is not "4ch" are left untouched. For each "4ch"
    message the function:

    1. Serialises the message with sorted keys and hashes it with siphash,
       keyed by the module-level hash_key.
    2. Looks up "cache.<net>.<channel>.<no>" in db.r. If the stored hash
       equals the new one the message is skipped; if a different hash is
       stored the message gets "type" set to "update". The new hash is then
       written back.
    3. Renames every key found in ATTRMAP to its mapped name.
    4. Replaces a "ts" string such as '08/30/22(Tue)02:25:37' (seconds
       optional) with an integer Unix timestamp.
    5. Replaces the HTML in "msg" with its plain text, using newlines as
       the separator between text nodes.

    NOTE(review): the function returns nothing and only mutates `data`.
    When it is run through the ProcessPoolExecutor (see process_data_thread)
    the worker receives a pickled copy, so these mutations are not visible
    to the submitting process — confirm how results are meant to be
    collected.

    :param data: iterable of message dicts
    """
    print("PROCESSING DATA", data)
    for msg in data:
        # print("PROCESSING", msg)
        if msg["src"] != "4ch":
            continue

        board = msg["net"]
        thread = msg["channel"]

        # Calculate hash for post
        post_normalised = ujson.dumps(msg, sort_keys=True)
        post_hash = str(siphash(hash_key, post_normalised))
        redis_key = f"cache.{board}.{thread}.{msg['no']}"
        key_content = db.r.get(redis_key)
        if key_content:
            if key_content.decode("ascii") == post_hash:
                # Identical to the cached version: nothing to do
                continue
            # Seen before with different content: flag the message itself
            # (indexing the message with the loop index raised KeyError)
            msg["type"] = "update"
        db.r.set(redis_key, post_hash)

        # Rename source-specific keys to their mapped names. Iterate over a
        # snapshot of the keys because the dict is modified inside the loop.
        for key in list(msg):
            if key in ATTRMAP:
                msg[ATTRMAP[key]] = msg.pop(key)

        if "ts" in msg:
            old_time = msg["ts"]
            # '08/30/22(Tue)02:25:37'
            if old_time.count(":") == 2:
                time_format = "%m/%d/%y(%a)%H:%M:%S"
            else:
                time_format = "%m/%d/%y(%a)%H:%M"
            # `datetime` is imported as a module, so the class must be
            # reached as datetime.datetime.
            old_ts = datetime.datetime.strptime(old_time, time_format)
            # NOTE(review): old_ts is naive, so timestamp() interprets it in
            # the local timezone of the process — confirm this matches the
            # timezone of the source data.
            msg["ts"] = int(old_ts.timestamp())

        if "msg" in msg:
            soup = BeautifulSoup(msg["msg"], "html.parser")
            msg["msg"] = soup.get_text(separator="\n")