Properly process Redis buffered messages and ingest into Kafka

2022-09-14 18:32:32 +01:00 · 2022-09-14 18:32:32 +01:00 · 4ea77ac543
parent fec0d379a6
commit 4ea77ac543
6 changed files with 190 additions and 133 deletions
--- a/db.py
+++ b/db.py
@ -1,15 +1,15 @@
 import random
 from math import ceil
 import aioredis
 import manticoresearch
 import ujson
 from aiokafka import AIOKafkaProducer
 from manticoresearch.rest import ApiException
 from numpy import array_split
 from redis import StrictRedis
 import util
 import random
 from aiokafka import AIOKafkaProducer
 # Manticore schema
 from schemas import mc_s
@ -21,6 +21,7 @@ api_instance = manticoresearch.IndexApi(api_client)
 # Kafka
 from aiokafka import AIOKafkaProducer
 KAFKA_TOPIC = "msg"
 log = util.get_logger("db")
@ -51,7 +52,7 @@ KEYPREFIX = "queue."
 async def store_kafka_batch(data):
    print("STORING KAFKA BATCH")
-    producer = AIOKafkaProducer(bootstrap_servers='kafka:9092')
+    producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
    await producer.start()
    batch = producer.create_batch()
    for msg in data:
@ -70,67 +71,74 @@ async def store_kafka_batch(data):
                del msg[key]
            if key in schema:
                if isinstance(value, int):
-                    if schema[key].startswith("string") or schema[key].startswith("text"):
+                    if schema[key].startswith("string") or schema[key].startswith(
                        "text"
                    ):
                        msg[key] = str(value)
        message = ujson.dumps(msg)
        body = str.encode(message)
        if "ts" not in msg:
            # print("MSG WITHOUT TS", msg)
            continue
        metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
        if metadata is None:
            partitions = await producer.partitions_for(KAFKA_TOPIC)
            partition = random.choice(tuple(partitions))
            await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
-            print("%d messages sent to partition %d"
+            print(
-                  % (batch.record_count(), partition))
+                "%d messages sent to partition %d" % (batch.record_count(), partition)
            )
            batch = producer.create_batch()
            continue
    partitions = await producer.partitions_for(KAFKA_TOPIC)
    partition = random.choice(tuple(partitions))
    await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
-    print("%d messages sent to partition %d"
+    print("%d messages sent to partition %d" % (batch.record_count(), partition))
          % (batch.record_count(), partition))
    await producer.stop()
 # def store_message(msg):
 #     """
 #     Store a message into Manticore
 #     :param msg: dict
 #     """
 #     store_kafka(msg)
-    # # Duplicated to avoid extra function call
+# # Duplicated to avoid extra function call
-    # if msg["type"] in TYPES_MAIN:
+# if msg["type"] in TYPES_MAIN:
-    #     index = "main"
+#     index = "main"
-    #     schema = mc_s.schema_main
+#     schema = mc_s.schema_main
-    # elif msg["type"] in TYPES_META:
+# elif msg["type"] in TYPES_META:
-    #     index = "meta"
+#     index = "meta"
-    #     schema = mc_s.schema_meta
+#     schema = mc_s.schema_meta
-    # elif msg["type"] in TYPES_INT:
+# elif msg["type"] in TYPES_INT:
-    #     index = "internal"
+#     index = "internal"
-    #     schema = mc_s.schema_int
+#     schema = mc_s.schema_int
-    # # normalise fields
+# # normalise fields
-    # for key, value in list(msg.items()):
+# for key, value in list(msg.items()):
-    #     if value is None:
+#     if value is None:
-    #         del msg[key]
+#         del msg[key]
-    #     if key in schema:
+#     if key in schema:
-    #         if isinstance(value, int):
+#         if isinstance(value, int):
-    #             if schema[key].startswith("string") or schema[key].startswith("text"):
+#             if schema[key].startswith("string") or schema[key].startswith("text"):
-    #                 msg[key] = str(value)
+#                 msg[key] = str(value)
-    # body = [{"insert": {"index": index, "doc": msg}}]
+# body = [{"insert": {"index": index, "doc": msg}}]
-    # body_post = ""
+# body_post = ""
-    # for item in body:
+# for item in body:
-    #     body_post += ujson.dumps(item)
+#     body_post += ujson.dumps(item)
-    #     body_post += "\n"
+#     body_post += "\n"
 # # print(body_post)
 # try:
 #     # Bulk index operations
 #     print("FAKE POST")
 #     #api_response = api_instance.bulk(body_post)  # , async_req=True
 #     # print(api_response)
 # except ApiException as e:
 #     print("Exception when calling IndexApi->bulk: %s\n" % e)
 #     print("ATTEMPT", body_post)
    # # print(body_post)
    # try:
    #     # Bulk index operations
    #     print("FAKE POST")
    #     #api_response = api_instance.bulk(body_post)  # , async_req=True
    #     # print(api_response)
    # except ApiException as e:
    #     print("Exception when calling IndexApi->bulk: %s\n" % e)
    #     print("ATTEMPT", body_post)
 async def queue_message(msg):
    """
@ -139,9 +147,10 @@ async def queue_message(msg):
    src = msg["src"]
    message = ujson.dumps(msg)
-    key = "{KEYPREFIX}{src}"
+    key = f"{KEYPREFIX}{src}"
    await ar.sadd(key, message)
 async def queue_message_bulk(data):
    """
    Queue multiple messages on the Redis buffer.
@ -150,7 +159,7 @@ async def queue_message_bulk(data):
        src = msg["src"]
        message = ujson.dumps(msg)
-        key = "{KEYPREFIX}{src}"
+        key = f"{KEYPREFIX}{src}"
        await ar.sadd(key, message)
@ -176,50 +185,50 @@ def queue_message_bulk_sync(data):
 #         return
 #     for msg in data:
 #         store_kafka(msg)
-    # # 10000: maximum inserts we can submit to
+# # 10000: maximum inserts we can submit to
-    # # Manticore as of Sept 2022
+# # Manticore as of Sept 2022
-    # split_posts = array_split(data, ceil(len(data) / 10000))
+# split_posts = array_split(data, ceil(len(data) / 10000))
-    # for messages in split_posts:
+# for messages in split_posts:
-    #     total = []
+#     total = []
-    #     for msg in messages:
+#     for msg in messages:
-    #         # Duplicated to avoid extra function call (see above)
+#         # Duplicated to avoid extra function call (see above)
-    #         if msg["type"] in TYPES_MAIN:
+#         if msg["type"] in TYPES_MAIN:
-    #             index = "main"
+#             index = "main"
-    #             schema = mc_s.schema_main
+#             schema = mc_s.schema_main
-    #         elif msg["type"] in TYPES_META:
+#         elif msg["type"] in TYPES_META:
-    #             index = "meta"
+#             index = "meta"
-    #             schema = mc_s.schema_meta
+#             schema = mc_s.schema_meta
-    #         elif msg["type"] in TYPES_INT:
+#         elif msg["type"] in TYPES_INT:
-    #             index = "internal"
+#             index = "internal"
-    #             schema = mc_s.schema_int
+#             schema = mc_s.schema_int
-    #         # normalise fields
+#         # normalise fields
-    #         for key, value in list(msg.items()):
+#         for key, value in list(msg.items()):
-    #             if value is None:
+#             if value is None:
-    #                 del msg[key]
+#                 del msg[key]
-    #             if key in schema:
+#             if key in schema:
-    #                 if isinstance(value, int):
+#                 if isinstance(value, int):
-    #                     if schema[key].startswith("string") or schema[key].startswith(
+#                     if schema[key].startswith("string") or schema[key].startswith(
-    #                         "text"
+#                         "text"
-    #                     ):
+#                     ):
-    #                         msg[key] = str(value)
+#                         msg[key] = str(value)
-    #         body = {"insert": {"index": index, "doc": msg}}
+#         body = {"insert": {"index": index, "doc": msg}}
-    #         total.append(body)
+#         total.append(body)
-    #     body_post = ""
+#     body_post = ""
-    #     for item in total:
+#     for item in total:
-    #         body_post += ujson.dumps(item)
+#         body_post += ujson.dumps(item)
-    #         body_post += "\n"
+#         body_post += "\n"
-    #     # print(body_post)
+#     # print(body_post)
-    #     try:
+#     try:
-    #         # Bulk index operations
+#         # Bulk index operations
-    #         print("FAKE POST")
+#         print("FAKE POST")
-    #         #api_response = api_instance.bulk(body_post)  # , async_req=True
+#         #api_response = api_instance.bulk(body_post)  # , async_req=True
-    #         #print(api_response)
+#         #print(api_response)
-    #     except ApiException as e:
+#     except ApiException as e:
-    #         print("Exception when calling IndexApi->bulk: %s\n" % e)
+#         print("Exception when calling IndexApi->bulk: %s\n" % e)
-    #         print("ATTEMPT", body_post)
+#         print("ATTEMPT", body_post)
 # def update_schema():
@ -243,5 +252,5 @@ def queue_message_bulk_sync(data):
 #         util_instance.sql(create_query)
-#create_index(api_client)
+# create_index(api_client)
-#update_schema()
+# update_schema()
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -19,7 +19,11 @@ services:
      - .env
    volumes_from:
      - tmp
-    # depends_on:
+    depends_on:
      - broker
      - kafka
      - tmp
      - redis
    #   - db
  threshold:
@ -52,12 +56,16 @@ services:
      - 9093:9090
    environment:
      - DRUID_BROKER_URL=http://broker:8082
    depends_on:
      - broker
  metabase:
    container_name: metabase
    image: metabase/metabase:latest
    ports:
      - 3001:3000
    depends_on:
      - broker
  postgres:
    container_name: postgres
@ -82,6 +90,7 @@ services:
    image: bitnami/kafka
    depends_on:
      - zookeeper
      - broker
    ports:
      - 29092:29092
      - 9092:9092
--- a/monolith.py
+++ b/monolith.py
@ -1,11 +1,11 @@
 import asyncio
 from os import getenv
 import db
 import util
 from sources.ch4 import Chan4
 from sources.dis import DiscordClient
 from sources.ingest import Ingest
 import db
 # For development
 # if not getenv("DISCORD_TOKEN", None):
@ -27,7 +27,6 @@ async def main(loop):
    log.info("Starting Discord handler.")
    client = DiscordClient()
    loop.create_task(client.start(token))
    # client.run(token)
    log.info("Starting 4chan handler.")
    chan = Chan4()
--- a/processing/process.py
+++ b/processing/process.py
@ -1,7 +1,20 @@
 from concurrent.futures import ProcessPoolExecutor
 import asyncio
 import os
 import random
 # For key generation
 import string
 from concurrent.futures import ProcessPoolExecutor
 # For timestamp processing
 from datetime import datetime
 from math import ceil
 import ujson
 # For 4chan message parsing
 from bs4 import BeautifulSoup
 from numpy import array_split
 from siphashc import siphash
 import db
@ -10,19 +23,6 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
 # For key generation
 import string
 import random
 # For timestamp processing
 import datetime
 # For 4chan message parsing
 from bs4 import BeautifulSoup
 from numpy import array_split
 from math import ceil
 log = util.get_logger("process")
 # Maximum number of CPU threads to use for post processing
@ -30,6 +30,7 @@ CPU_THREADS = os.cpu_count()
 p = ProcessPoolExecutor(CPU_THREADS)
 def get_hash_key():
    hash_key = db.r.get("hashing_key")
    if not hash_key:
@ -42,33 +43,68 @@ def get_hash_key():
        log.debug(f"Decoded hash key: {hash_key}")
    return hash_key
 hash_key = get_hash_key()
@asyncio.coroutine
 async def spawn_processing_threads(data):
-    print("SPAWN", data)
+    loop = asyncio.get_event_loop()
    tasks = []
    oldts = [x["now"] for x in data if "now" in x]
    if len(data) < CPU_THREADS:
        split_data = [data]
    else:
        msg_per_core = int(len(data) / CPU_THREADS)
        print("MSG PER CORE", msg_per_core)
        split_data = array_split(data, ceil(len(data) / msg_per_core))
-    print("SPLIT DATA", split_data)
+    for index, split in enumerate(split_data):
    for split in split_data:
        print("DELEGATING TO THREAD", len(split))
-        await process_data_thread(split)
+        future = loop.run_in_executor(p, process_data, data)
        # future = p.submit(process_data, split)
        tasks.append(future)
    # results = [x.result(timeout=50) for x in tasks]
    results = await asyncio.gather(*tasks)
    print("RESULTS", len(results))
    # Join the results back from the split list
    flat_list = [item for sublist in results for item in sublist]
    print("LENFLAT", len(flat_list))
    print("LENDATA", len(data))
    newts = [x["ts"] for x in flat_list if "ts" in x]
    print("lenoldts", len(oldts))
    print("lennewts", len(newts))
    allts = all(["ts" in x for x in flat_list])
    print("ALLTS", allts)
    alllen = [len(x) for x in flat_list]
    print("ALLLEN", alllen)
    await db.store_kafka_batch(flat_list)
 # @asyncio.coroutine
 # def process_data_thread(data):
 #     """
 #     Helper to spawn threads to process a list of data.
 #     """
 #     loop = asyncio.get_event_loop()
 #     if len(data) < CPU_THREADS:
 #         split_data = [data]
 #     else:
 #         msg_per_core = int(len(data) / CPU_THREADS)
 #         print("MSG PER CORE", msg_per_core)
 #         split_data = array_split(data, ceil(len(data) / msg_per_core))
 #     for index, split in enumerate(split_data):
 #         print("DELEGATING TO THREAD", len(split))
 #         #f = process_data_thread(split)
 #         yield loop.run_in_executor(p, process_data, data)
@asyncio.coroutine
 def process_data_thread(data):
    """
    Helper to spawn threads to process a list of data.
    """
    loop = asyncio.get_event_loop()
    yield from loop.run_in_executor(p, process_data, data)
 def process_data(data):
-    print("PROCESSING DATA", data)
+    print("PROCESS DATA START")
    # to_store = []
    for index, msg in enumerate(data):
-        #print("PROCESSING", msg)
+        # print("PROCESSING", msg)
        if msg["src"] == "4ch":
            board = msg["net"]
            thread = msg["channel"]
@ -81,15 +117,18 @@ def process_data(data):
            if key_content:
                key_content = key_content.decode("ascii")
                if key_content == hash:
                    del data[index]
                    continue
                else:
-                    data[index][index]["type"] = "update"
+                    data[index]["type"] = "update"
            db.r.set(redis_key, hash)
-            for key2, value in list(msg.items()):
+            if "now" not in data[index]:
                print("NOW NOT IN INDEX", data[index])
            for key2, value in list(data[index].items()):
                if key2 in ATTRMAP:
-                    msg[ATTRMAP[key2]] = data[index][key2]
+                    data[index][ATTRMAP[key2]] = data[index][key2]
                    del data[index][key2]
-            if "ts" in msg:
+            if "ts" in data[index]:
                old_time = data[index]["ts"]
                # '08/30/22(Tue)02:25:37'
                time_spl = old_time.split(":")
@ -100,7 +139,13 @@ def process_data(data):
                # new_ts = old_ts.isoformat()
                new_ts = int(old_ts.timestamp())
                data[index]["ts"] = new_ts
            else:
                print("MSG WITHOUT TS PROCESS", data[index])
                continue
            if "msg" in msg:
                soup = BeautifulSoup(data[index]["msg"], "html.parser")
                msg = soup.get_text(separator="\n")
                data[index]["msg"] = msg
        # to_store.append(data[index])
    print("FINISHED PROCESSING DATA")
    return data
--- a/sources/ch4.py
+++ b/sources/ch4.py
@ -136,7 +136,7 @@ class Chan4(object):
        # Split into 10,000 chunks
        if not all_posts:
            return
-        self.handle_posts(all_posts)
+        await self.handle_posts(all_posts)
        # threads_per_core = int(len(all_posts) / CPU_THREADS)
        # for i in range(CPU_THREADS):
        #     new_dict = {}
@ -146,8 +146,7 @@ class Chan4(object):
        #             new_dict[k].append(v)
        #         else:
        #             new_dict[k] = [v]
-            #await self.handle_posts_thread(new_dict)
+        # await self.handle_posts_thread(new_dict)
        # print("VAL", ceil(len(all_posts) / threads_per_core))
        # split_posts = array_split(all_posts, ceil(len(all_posts) / threads_per_core))
--- a/sources/ingest.py
+++ b/sources/ingest.py
@ -4,24 +4,22 @@ import ujson
 import db
 import util
 from processing import process
-SOURCES = ["irc", "dis", "4ch"]
+SOURCES = ["4ch", "irc", "dis"]
 KEYPREFIX = "queue."
-CHUNK_SIZE = 1000
+CHUNK_SIZE = 90000
 ITER_DELAY = 0.5
 class Ingest(object):
    def __init__(self):
        name = self.__class__.__name__
        self.log = util.get_logger(name)
    async def run(self):
        # items = [{'no': 23567753, 'now': '09/12/22(Mon)20:10:29', 'name': 'Anonysmous', 'filename': '1644986767568', 'ext': '.webm', 'w': 1280, 'h': 720, 'tn_w': 125, 'tn_h': 70, 'tim': 1663027829301457, 'time': 1663027829, 'md5': 'zeElr1VR05XpZ2XuAPhmPA==', 'fsize': 3843621, 'resto': 23554700, 'type': 'msg', 'src': '4ch', 'net': 'gif', 'channel': '23554700'}]
        # await process.spawn_processing_threads(items)
        while True:
            await self.get_chunk()
            await asyncio.sleep(ITER_DELAY)
@ -33,13 +31,11 @@ class Ingest(object):
            chunk = await db.ar.spop(key, CHUNK_SIZE)
            if not chunk:
                continue
-            #self.log.info(f"Got chunk: {chunk}")
+            # self.log.info(f"Got chunk: {chunk}")
            for item in chunk:
                item = ujson.loads(item)
-                #self.log.info(f"Got item: {item}")
+                # self.log.info(f"Got item: {item}")
                items.append(item)
        if items:
            print("PROCESSING", len(items))
            await process.spawn_processing_threads(items)
            print("DONE WITH PROCESSING", len(items))
            await db.store_kafka_batch(items)