Implement sentiment/NLP annotation and optimise processing

2022-09-16 17:09:49 +01:00
parent 4ea77ac543
commit 143f2a0bf0
11 changed files with 203 additions and 338 deletions
--- a/sources/ingest.py
+++ b/sources/ingest.py
@@ -1,6 +1,6 @@
 import asyncio

-import ujson
+import orjson

 import db
 import util
@@ -8,9 +8,13 @@ from processing import process

 SOURCES = ["4ch", "irc", "dis"]
 KEYPREFIX = "queue."
-CHUNK_SIZE = 90000
+
+# Chunk size per source (multiply by len(SOURCES) for the total per iteration)
+CHUNK_SIZE = 9000
 ITER_DELAY = 0.5

+log = util.get_logger("ingest")
+

 class Ingest(object):
    def __init__(self):
@@ -18,8 +22,6 @@ class Ingest(object):
        self.log = util.get_logger(name)

    async def run(self):
-        # items = [{'no': 23567753, 'now': '09/12/22(Mon)20:10:29', 'name': 'Anonysmous', 'filename': '1644986767568', 'ext': '.webm', 'w': 1280, 'h': 720, 'tn_w': 125, 'tn_h': 70, 'tim': 1663027829301457, 'time': 1663027829, 'md5': 'zeElr1VR05XpZ2XuAPhmPA==', 'fsize': 3843621, 'resto': 23554700, 'type': 'msg', 'src': '4ch', 'net': 'gif', 'channel': '23554700'}]
-        # await process.spawn_processing_threads(items)
        while True:
            await self.get_chunk()
            await asyncio.sleep(ITER_DELAY)
@@ -31,11 +33,8 @@ class Ingest(object):
            chunk = await db.ar.spop(key, CHUNK_SIZE)
            if not chunk:
                continue
-            # self.log.info(f"Got chunk: {chunk}")
            for item in chunk:
-                item = ujson.loads(item)
-                # self.log.info(f"Got item: {item}")
+                item = orjson.loads(item)
                items.append(item)
        if items:
-            print("PROCESSING", len(items))
            await process.spawn_processing_threads(items)