Properly process Redis buffered messages and ingest into Kafka

2022-09-14 18:32:32 +01:00 · 2022-09-14 18:32:32 +01:00 · f432e9b29e
parent c5f01c3084
commit f432e9b29e
6 changed files with 190 additions and 133 deletions
--- a/db.py
+++ b/db.py
@ -1,15 +1,15 @@
+import random
 from math import ceil

 import aioredis
 import manticoresearch
 import ujson
+from aiokafka import AIOKafkaProducer
 from manticoresearch.rest import ApiException
 from numpy import array_split
 from redis import StrictRedis

 import util
-import random
-from aiokafka import AIOKafkaProducer

 # Manticore schema
 from schemas import mc_s
@ -21,6 +21,7 @@ api_instance = manticoresearch.IndexApi(api_client)

 # Kafka
 from aiokafka import AIOKafkaProducer
+
 KAFKA_TOPIC = "msg"

 log = util.get_logger("db")
@ -51,7 +52,7 @@ KEYPREFIX = "queue."

 async def store_kafka_batch(data):
    print("STORING KAFKA BATCH")
-    producer = AIOKafkaProducer(bootstrap_servers='kafka:9092')
+    producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
    await producer.start()
    batch = producer.create_batch()
    for msg in data:
@ -70,27 +71,33 @@ async def store_kafka_batch(data):
                del msg[key]
            if key in schema:
                if isinstance(value, int):
-                    if schema[key].startswith("string") or schema[key].startswith("text"):
+                    if schema[key].startswith("string") or schema[key].startswith(
+                        "text"
+                    ):
                        msg[key] = str(value)
        message = ujson.dumps(msg)
        body = str.encode(message)
+        if "ts" not in msg:
+            # print("MSG WITHOUT TS", msg)
+            continue
        metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
        if metadata is None:
            partitions = await producer.partitions_for(KAFKA_TOPIC)
            partition = random.choice(tuple(partitions))
            await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
-            print("%d messages sent to partition %d"
-                  % (batch.record_count(), partition))
+            print(
+                "%d messages sent to partition %d" % (batch.record_count(), partition)
+            )
            batch = producer.create_batch()
            continue

    partitions = await producer.partitions_for(KAFKA_TOPIC)
    partition = random.choice(tuple(partitions))
    await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
-    print("%d messages sent to partition %d"
-          % (batch.record_count(), partition))
+    print("%d messages sent to partition %d" % (batch.record_count(), partition))
    await producer.stop()

+
 # def store_message(msg):
 #     """
 #     Store a message into Manticore
@ -132,6 +139,7 @@ async def store_kafka_batch(data):
 #     print("Exception when calling IndexApi->bulk: %s\n" % e)
 #     print("ATTEMPT", body_post)

+
 async def queue_message(msg):
    """
    Queue a message on the Redis buffer.
@ -139,9 +147,10 @@ async def queue_message(msg):
    src = msg["src"]
    message = ujson.dumps(msg)

-    key = "{KEYPREFIX}{src}"
+    key = f"{KEYPREFIX}{src}"
    await ar.sadd(key, message)

+
 async def queue_message_bulk(data):
    """
    Queue multiple messages on the Redis buffer.
@ -150,7 +159,7 @@ async def queue_message_bulk(data):
        src = msg["src"]
        message = ujson.dumps(msg)

-        key = "{KEYPREFIX}{src}"
+        key = f"{KEYPREFIX}{src}"
        await ar.sadd(key, message)


--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -19,7 +19,11 @@ services:
      - .env
    volumes_from:
      - tmp
-    # depends_on:
+    depends_on:
+      - broker
+      - kafka
+      - tmp
+      - redis
    #   - db

  threshold:
@ -52,12 +56,16 @@ services:
      - 9093:9090
    environment:
      - DRUID_BROKER_URL=http://broker:8082
+    depends_on:
+      - broker

  metabase:
    container_name: metabase
    image: metabase/metabase:latest
    ports:
      - 3001:3000
+    depends_on:
+      - broker

  postgres:
    container_name: postgres
@ -82,6 +90,7 @@ services:
    image: bitnami/kafka
    depends_on:
      - zookeeper
+      - broker
    ports:
      - 29092:29092
      - 9092:9092
--- a/monolith.py
+++ b/monolith.py
@ -1,11 +1,11 @@
 import asyncio
 from os import getenv

+import db
 import util
 from sources.ch4 import Chan4
 from sources.dis import DiscordClient
 from sources.ingest import Ingest
-import db

 # For development
 # if not getenv("DISCORD_TOKEN", None):
@ -27,7 +27,6 @@ async def main(loop):
    log.info("Starting Discord handler.")
    client = DiscordClient()
    loop.create_task(client.start(token))
-    # client.run(token)

    log.info("Starting 4chan handler.")
    chan = Chan4()
--- a/processing/process.py
+++ b/processing/process.py
@ -1,7 +1,20 @@
-from concurrent.futures import ProcessPoolExecutor
 import asyncio
 import os
+import random
+
+# For key generation
+import string
+from concurrent.futures import ProcessPoolExecutor
+
+# For timestamp processing
+from datetime import datetime
+from math import ceil
+
 import ujson
+
+# For 4chan message parsing
+from bs4 import BeautifulSoup
+from numpy import array_split
 from siphashc import siphash

 import db
@ -10,19 +23,6 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP

-# For key generation
-import string
-import random
-
-# For timestamp processing
-import datetime
-
-# For 4chan message parsing
-from bs4 import BeautifulSoup
-
-from numpy import array_split
-from math import ceil
-
 log = util.get_logger("process")

 # Maximum number of CPU threads to use for post processing
@ -30,6 +30,7 @@ CPU_THREADS = os.cpu_count()

 p = ProcessPoolExecutor(CPU_THREADS)

+
 def get_hash_key():
    hash_key = db.r.get("hashing_key")
    if not hash_key:
@ -42,31 +43,66 @@ def get_hash_key():
        log.debug(f"Decoded hash key: {hash_key}")
    return hash_key

+
 hash_key = get_hash_key()

+
+@asyncio.coroutine
 async def spawn_processing_threads(data):
-    print("SPAWN", data)
+    loop = asyncio.get_event_loop()
+    tasks = []
+    oldts = [x["now"] for x in data if "now" in x]
    if len(data) < CPU_THREADS:
        split_data = [data]
    else:
        msg_per_core = int(len(data) / CPU_THREADS)
        print("MSG PER CORE", msg_per_core)
        split_data = array_split(data, ceil(len(data) / msg_per_core))
-    print("SPLIT DATA", split_data)
-    for split in split_data:
+    for index, split in enumerate(split_data):
        print("DELEGATING TO THREAD", len(split))
-        await process_data_thread(split)
+        future = loop.run_in_executor(p, process_data, data)
+        # future = p.submit(process_data, split)
+        tasks.append(future)
+    # results = [x.result(timeout=50) for x in tasks]
+    results = await asyncio.gather(*tasks)
+    print("RESULTS", len(results))
+
+    # Join the results back from the split list
+    flat_list = [item for sublist in results for item in sublist]
+    print("LENFLAT", len(flat_list))
+    print("LENDATA", len(data))
+
+    newts = [x["ts"] for x in flat_list if "ts" in x]
+    print("lenoldts", len(oldts))
+    print("lennewts", len(newts))
+    allts = all(["ts" in x for x in flat_list])
+    print("ALLTS", allts)
+    alllen = [len(x) for x in flat_list]
+    print("ALLLEN", alllen)
+    await db.store_kafka_batch(flat_list)
+
+
+# @asyncio.coroutine
+# def process_data_thread(data):
+#     """
+#     Helper to spawn threads to process a list of data.
+#     """
+#     loop = asyncio.get_event_loop()
+#     if len(data) < CPU_THREADS:
+#         split_data = [data]
+#     else:
+#         msg_per_core = int(len(data) / CPU_THREADS)
+#         print("MSG PER CORE", msg_per_core)
+#         split_data = array_split(data, ceil(len(data) / msg_per_core))
+#     for index, split in enumerate(split_data):
+#         print("DELEGATING TO THREAD", len(split))
+#         #f = process_data_thread(split)
+#         yield loop.run_in_executor(p, process_data, data)

-@asyncio.coroutine
-def process_data_thread(data):
-    """
-    Helper to spawn threads to process a list of data.
-    """
-    loop = asyncio.get_event_loop()
-    yield from loop.run_in_executor(p, process_data, data)

 def process_data(data):
-    print("PROCESSING DATA", data)
+    print("PROCESS DATA START")
+    # to_store = []
    for index, msg in enumerate(data):
        # print("PROCESSING", msg)
        if msg["src"] == "4ch":
@ -81,15 +117,18 @@ def process_data(data):
            if key_content:
                key_content = key_content.decode("ascii")
                if key_content == hash:
+                    del data[index]
                    continue
                else:
-                    data[index][index]["type"] = "update"
+                    data[index]["type"] = "update"
            db.r.set(redis_key, hash)
-            for key2, value in list(msg.items()):
+            if "now" not in data[index]:
+                print("NOW NOT IN INDEX", data[index])
+            for key2, value in list(data[index].items()):
                if key2 in ATTRMAP:
-                    msg[ATTRMAP[key2]] = data[index][key2]
+                    data[index][ATTRMAP[key2]] = data[index][key2]
                    del data[index][key2]
-            if "ts" in msg:
+            if "ts" in data[index]:
                old_time = data[index]["ts"]
                # '08/30/22(Tue)02:25:37'
                time_spl = old_time.split(":")
@ -100,7 +139,13 @@ def process_data(data):
                # new_ts = old_ts.isoformat()
                new_ts = int(old_ts.timestamp())
                data[index]["ts"] = new_ts
+            else:
+                print("MSG WITHOUT TS PROCESS", data[index])
+                continue
            if "msg" in msg:
                soup = BeautifulSoup(data[index]["msg"], "html.parser")
                msg = soup.get_text(separator="\n")
                data[index]["msg"] = msg
+        # to_store.append(data[index])
+    print("FINISHED PROCESSING DATA")
+    return data
--- a/sources/ch4.py
+++ b/sources/ch4.py
@ -136,7 +136,7 @@ class Chan4(object):
        # Split into 10,000 chunks
        if not all_posts:
            return
-        self.handle_posts(all_posts)
+        await self.handle_posts(all_posts)
        # threads_per_core = int(len(all_posts) / CPU_THREADS)
        # for i in range(CPU_THREADS):
        #     new_dict = {}
@ -148,7 +148,6 @@ class Chan4(object):
        #             new_dict[k] = [v]
        # await self.handle_posts_thread(new_dict)

-
        # print("VAL", ceil(len(all_posts) / threads_per_core))
        # split_posts = array_split(all_posts, ceil(len(all_posts) / threads_per_core))
        # print("THREADS PER CORE SPLIT", len(split_posts))
--- a/sources/ingest.py
+++ b/sources/ingest.py
@ -4,24 +4,22 @@ import ujson

 import db
 import util
-
 from processing import process

-SOURCES = ["irc", "dis", "4ch"]
+SOURCES = ["4ch", "irc", "dis"]
 KEYPREFIX = "queue."
-CHUNK_SIZE = 1000
+CHUNK_SIZE = 90000
 ITER_DELAY = 0.5


-
-
-
 class Ingest(object):
    def __init__(self):
        name = self.__class__.__name__
        self.log = util.get_logger(name)

    async def run(self):
+        # items = [{'no': 23567753, 'now': '09/12/22(Mon)20:10:29', 'name': 'Anonysmous', 'filename': '1644986767568', 'ext': '.webm', 'w': 1280, 'h': 720, 'tn_w': 125, 'tn_h': 70, 'tim': 1663027829301457, 'time': 1663027829, 'md5': 'zeElr1VR05XpZ2XuAPhmPA==', 'fsize': 3843621, 'resto': 23554700, 'type': 'msg', 'src': '4ch', 'net': 'gif', 'channel': '23554700'}]
+        # await process.spawn_processing_threads(items)
        while True:
            await self.get_chunk()
            await asyncio.sleep(ITER_DELAY)
@ -41,5 +39,3 @@ class Ingest(object):
        if items:
            print("PROCESSING", len(items))
            await process.spawn_processing_threads(items)
-            print("DONE WITH PROCESSING", len(items))
-            await db.store_kafka_batch(items)