Improve memory usage and fix 4chan crawler

2022-10-21 07:20:30 +01:00
parent 2d7b6268dd
commit 51a9b2af79
6 changed files with 87 additions and 48 deletions


@@ -1,8 +1,6 @@
 import asyncio
 from os import getenv
-import orjson
 import db
 import util
 from processing import process
@@ -20,6 +18,7 @@ class Ingest(object):
     def __init__(self):
         name = self.__class__.__name__
         self.log = util.get_logger(name)
+        self.current_chunk = 0
         self.log.info(
             (
                 "Starting ingest handler for chunk size of "
@@ -30,20 +29,14 @@ class Ingest(object):
     async def run(self):
         while True:
             await self.get_chunk()
+            self.log.debug(f"Ingest chunk {self.current_chunk} complete")
+            self.current_chunk += 1
             await asyncio.sleep(ITER_DELAY)
 
     async def get_chunk(self):
-        items = []
-        # for source in SOURCES:
-        #     key = f"{KEYPREFIX}{source}"
         length = await db.ar.llen(KEYNAME)
-        start_num = length - CHUNK_SIZE
-        chunk = await db.ar.lrange(KEYNAME, start_num, -1)
-        # chunk = await db.ar.rpop(KEYNAME, CHUNK_SIZE)
-        if not chunk:
+        if length > CHUNK_SIZE:
+            length = CHUNK_SIZE
+        if not length:
             return
-        for item in chunk:
-            item = orjson.loads(item)
-            items.append(item)
-        if items:
-            await process.spawn_processing_threads(items)
+        await process.spawn_processing_threads(self.current_chunk, length)
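
The memory win is that get_chunk no longer builds an items list at all: it clamps the queue length to CHUNK_SIZE and passes only the chunk number and item count onward, leaving the pop-and-decode work to the processing side. The body of spawn_processing_threads is not part of this diff; as a minimal sketch of that contract — assuming db.ar is a redis-py asyncio client, KEYNAME is the same list key the ingest loop measures, and process_batch is a hypothetical worker — it could look like:

import asyncio

import orjson

import db

KEYNAME = "queue"  # assumed: the same Redis list Ingest.get_chunk measures


async def spawn_processing_threads(chunk, length):
    # Pop up to `length` raw entries in one round trip; rpop with a count
    # argument needs redis-py >= 4.1 and Redis >= 6.2.
    raw = await db.ar.rpop(KEYNAME, length)
    if not raw:
        return
    # Decode in the consumer rather than in the ingest loop, so the ingest
    # process never holds the parsed items.
    items = [orjson.loads(entry) for entry in raw]
    # Hand the CPU-bound batch work to a thread so the event loop stays free.
    await asyncio.get_running_loop().run_in_executor(
        None, process_batch, chunk, items
    )


def process_batch(chunk, items):
    # Hypothetical worker entry point, not shown in the diff.
    ...

Under this contract the chunk number exists only for logging, and length caps how many entries one iteration may pull off the queue.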