monolith/processing/process.py

import asyncio
import os
import random

# For key generation
import string
from concurrent.futures import ProcessPoolExecutor

# For timestamp processing
from datetime import datetime
from math import ceil

import ujson

# For 4chan message parsing
from bs4 import BeautifulSoup
from numpy import array_split
from siphashc import siphash

import db
import util

# 4chan schema
from schemas.ch4_s import ATTRMAP

# Module-level logger obtained from the project's util helper.
log = util.get_logger("process")

# Maximum number of CPU threads to use for post processing.
# os.cpu_count() returns None when the count cannot be determined; fall back
# to 1 so the integer comparisons and divisions that use this value stay valid.
CPU_THREADS = os.cpu_count() or 1

# Shared process pool that spawn_processing_threads() submits work to.
p = ProcessPoolExecutor(CPU_THREADS)


def get_hash_key():
    """
    Return the hashing key stored under "hashing_key" in db.r.

    If a value is already stored it is decoded from ASCII bytes and returned.
    Otherwise a new key of 16 random lowercase ASCII letters is generated,
    written back under the same name and returned.
    """
    stored = db.r.get("hashing_key")
    if stored:
        # Existing key: the store hands back bytes, callers want a str.
        hash_key = stored.decode("ascii")
        log.debug(f"Decoded hash key: {hash_key}")
        return hash_key

    # No key yet: create one and persist it for later lookups.
    alphabet = string.ascii_lowercase
    hash_key = "".join([random.choice(alphabet) for _ in range(16)])
    log.debug(f"Created new hash key: {hash_key}")
    db.r.set("hashing_key", hash_key)
    return hash_key


# Resolved once at import time; process_data() reads this module-level value
# as the siphash key when hashing posts.
hash_key = get_hash_key()


async def spawn_processing_threads(data):
    """
    Process a batch of messages in the process pool and store the results.

    The batch is cut into contiguous chunks (at most one per worker), each
    chunk is handed to process_data() in the module-level executor `p`, the
    per-chunk results are flattened back into one list and that list is
    passed to db.store_kafka_batch().

    :param data: list of message dicts to process
    """
    # NOTE: the legacy @asyncio.coroutine decorator was dropped on purpose:
    # it is redundant on an `async def` and no longer exists in Python 3.11+.
    loop = asyncio.get_event_loop()
    oldts = [x["now"] for x in data if "now" in x]

    # CPU_THREADS comes from os.cpu_count(), which may be None.
    workers = CPU_THREADS or 1
    if len(data) < workers:
        split_data = [data]
    else:
        # Round up so the batch yields no more chunks than there are workers.
        msg_per_core = ceil(len(data) / workers)
        print("MSG PER CORE", msg_per_core)
        # Plain list slices (not numpy arrays) so process_data() receives
        # ordinary lists of dicts.
        split_data = [
            data[start:start + msg_per_core]
            for start in range(0, len(data), msg_per_core)
        ]

    tasks = []
    for split in split_data:
        print("DELEGATING TO THREAD", len(split))
        # Each worker gets only its own chunk; passing the whole batch here
        # would process (and later store) every message once per chunk.
        tasks.append(loop.run_in_executor(p, process_data, split))
    results = await asyncio.gather(*tasks)
    print("RESULTS", len(results))

    # Join the results back from the split list
    flat_list = [item for sublist in results for item in sublist]
    print("LENFLAT", len(flat_list))
    print("LENDATA", len(data))

    newts = [x["ts"] for x in flat_list if "ts" in x]
    print("lenoldts", len(oldts))
    print("lennewts", len(newts))
    allts = all("ts" in x for x in flat_list)
    print("ALLTS", allts)
    alllen = [len(x) for x in flat_list]
    print("ALLLEN", alllen)
    await db.store_kafka_batch(flat_list)


# @asyncio.coroutine
# def process_data_thread(data):
#     """
#     Helper to spawn threads to process a list of data.
#     """
#     loop = asyncio.get_event_loop()
#     if len(data) < CPU_THREADS:
#         split_data = [data]
#     else:
#         msg_per_core = int(len(data) / CPU_THREADS)
#         print("MSG PER CORE", msg_per_core)
#         split_data = array_split(data, ceil(len(data) / msg_per_core))
#     for index, split in enumerate(split_data):
#         print("DELEGATING TO THREAD", len(split))
#         #f = process_data_thread(split)
#         yield loop.run_in_executor(p, process_data, data)


def process_data(data):
    """
    Normalise a list of message dicts and drop unchanged 4chan duplicates.

    Messages whose "src" is "4ch" are normalised in place by
    _process_4ch_post(); posts whose hash matches the cached one are left out
    of the result. Messages from any other source are passed through
    untouched.

    :param data: list of message dicts, each with at least a "src" key
    :return: the surviving messages, in their original order. When `data` is
        a list it is updated in place and returned, as before.
    """
    print("PROCESS DATA START")
    kept = []
    # Collect survivors in a separate list: deleting from `data` while
    # iterating over it makes the loop skip the element after each deletion.
    for msg in data:
        if msg["src"] == "4ch" and not _process_4ch_post(msg):
            continue
        kept.append(msg)
    print("FINISHED PROCESSING DATA")
    if isinstance(data, list):
        # Keep the in-place contract for callers that reuse `data` afterwards.
        data[:] = kept
        return data
    return kept


def _process_4ch_post(post):
    """
    Normalise one 4chan post in place.

    :param post: message dict with "net", "channel" and "no" keys
    :return: False when the post's hash equals the cached hash (unchanged
        duplicate, caller should drop it), True otherwise
    """
    board = post["net"]
    thread = post["channel"]
    # Calculate hash for post (taken before any of the rewriting below)
    post_normalised = ujson.dumps(post, sort_keys=True)
    digest = str(siphash(hash_key, post_normalised))
    redis_key = f"cache.{board}.{thread}.{post['no']}"
    cached = db.r.get(redis_key)
    if cached:
        if cached.decode("ascii") == digest:
            return False
        # Seen before but the content differs: flag it as an update.
        post["type"] = "update"
    db.r.set(redis_key, digest)

    if "now" not in post:
        print("NOW NOT IN INDEX", post)

    # Rename source attributes to their schema names. Iterate over a snapshot
    # of the keys because the dict is modified inside the loop.
    for key in list(post):
        if key in ATTRMAP:
            post[ATTRMAP[key]] = post[key]
            del post[key]

    if "ts" not in post:
        print("MSG WITHOUT TS PROCESS", post)
        # Kept in the output as-is, without timestamp or text conversion.
        return True

    old_time = post["ts"]
    # '08/30/22(Tue)02:25:37'
    if len(old_time.split(":")) == 3:
        old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
    else:
        old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
    # NOTE(review): old_ts is naive, so timestamp() interprets it in the
    # host's local timezone - confirm that matches the source's timezone.
    post["ts"] = int(old_ts.timestamp())

    if "msg" in post:
        # Strip HTML markup, keeping one text fragment per line.
        soup = BeautifulSoup(post["msg"], "html.parser")
        post["msg"] = soup.get_text(separator="\n")
    return True
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`import asyncio`
			`import os`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`import random`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`# For key generation`
			`import string`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`from concurrent.futures import ProcessPoolExecutor`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`# For timestamp processing`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`from datetime import datetime`
			`from math import ceil`

			`import ujson`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`# For 4chan message parsing`
			`from bs4 import BeautifulSoup`
			`from numpy import array_split`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`from siphashc import siphash`

			`import db`
			`import util`

			`# 4chan schema`
			`from schemas.ch4_s import ATTRMAP`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`log = util.get_logger("process")`

			`# Maximum number of CPU threads to use for post processing`
			`CPU_THREADS = os.cpu_count()`

			`p = ProcessPoolExecutor(CPU_THREADS)`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`def get_hash_key():`
			`hash_key = db.r.get("hashing_key")`
			`if not hash_key:`
			`letters = string.ascii_lowercase`
			`hash_key = "".join(random.choice(letters) for i in range(16))`
			`log.debug(f"Created new hash key: {hash_key}")`
			`db.r.set("hashing_key", hash_key)`
			`else:`
			`hash_key = hash_key.decode("ascii")`
			`log.debug(f"Decoded hash key: {hash_key}")`
			`return hash_key`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`hash_key = get_hash_key()`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
			`@asyncio.coroutine`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`async def spawn_processing_threads(data):`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`loop = asyncio.get_event_loop()`
			`tasks = []`
			`oldts = [x["now"] for x in data if "now" in x]`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if len(data) < CPU_THREADS:`
			`split_data = [data]`
			`else:`
			`msg_per_core = int(len(data) / CPU_THREADS)`
			`print("MSG PER CORE", msg_per_core)`
			`split_data = array_split(data, ceil(len(data) / msg_per_core))`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`for index, split in enumerate(split_data):`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`print("DELEGATING TO THREAD", len(split))`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`future = loop.run_in_executor(p, process_data, data)`
			`# future = p.submit(process_data, split)`
			`tasks.append(future)`
			`# results = [x.result(timeout=50) for x in tasks]`
			`results = await asyncio.gather(*tasks)`
			`print("RESULTS", len(results))`

			`# Join the results back from the split list`
			`flat_list = [item for sublist in results for item in sublist]`
			`print("LENFLAT", len(flat_list))`
			`print("LENDATA", len(data))`

			`newts = [x["ts"] for x in flat_list if "ts" in x]`
			`print("lenoldts", len(oldts))`
			`print("lennewts", len(newts))`
			`allts = all(["ts" in x for x in flat_list])`
			`print("ALLTS", allts)`
			`alllen = [len(x) for x in flat_list]`
			`print("ALLLEN", alllen)`
			`await db.store_kafka_batch(flat_list)`


			`# @asyncio.coroutine`
			`# def process_data_thread(data):`
			`# """`
			`# Helper to spawn threads to process a list of data.`
			`# """`
			`# loop = asyncio.get_event_loop()`
			`# if len(data) < CPU_THREADS:`
			`# split_data = [data]`
			`# else:`
			`# msg_per_core = int(len(data) / CPU_THREADS)`
			`# print("MSG PER CORE", msg_per_core)`
			`# split_data = array_split(data, ceil(len(data) / msg_per_core))`
			`# for index, split in enumerate(split_data):`
			`# print("DELEGATING TO THREAD", len(split))`
			`# #f = process_data_thread(split)`
			`# yield loop.run_in_executor(p, process_data, data)`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00

			`def process_data(data):`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`print("PROCESS DATA START")`
			`# to_store = []`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`for index, msg in enumerate(data):`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`# print("PROCESSING", msg)`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if msg["src"] == "4ch":`
			`board = msg["net"]`
			`thread = msg["channel"]`
			`# Calculate hash for post`
			`post_normalised = ujson.dumps(msg, sort_keys=True)`
			`hash = siphash(hash_key, post_normalised)`
			`hash = str(hash)`
			`redis_key = f"cache.{board}.{thread}.{msg['no']}"`
			`key_content = db.r.get(redis_key)`
			`if key_content:`
			`key_content = key_content.decode("ascii")`
			`if key_content == hash:`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`del data[index]`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`continue`
			`else:`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`data[index]["type"] = "update"`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`db.r.set(redis_key, hash)`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`if "now" not in data[index]:`
			`print("NOW NOT IN INDEX", data[index])`
			`for key2, value in list(data[index].items()):`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if key2 in ATTRMAP:`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`data[index][ATTRMAP[key2]] = data[index][key2]`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`del data[index][key2]`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`if "ts" in data[index]:`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`old_time = data[index]["ts"]`
			`# '08/30/22(Tue)02:25:37'`
			`time_spl = old_time.split(":")`
			`if len(time_spl) == 3:`
			`old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")`
			`else:`
			`old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")`
			`# new_ts = old_ts.isoformat()`
			`new_ts = int(old_ts.timestamp())`
			`data[index]["ts"] = new_ts`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`else:`
			`print("MSG WITHOUT TS PROCESS", data[index])`
			`continue`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if "msg" in msg:`
			`soup = BeautifulSoup(data[index]["msg"], "html.parser")`
			`msg = soup.get_text(separator="\n")`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`data[index]["msg"] = msg`
			`# to_store.append(data[index])`
			`print("FINISHED PROCESSING DATA")`
			`return data`