Improve memory usage and fix 4chan crawler
@@ -74,26 +74,28 @@ class Chan4(object):
 
     async def get_thread_lists(self, boards):
-        # self.log.debug(f"Getting thread list for {boards}")
-        board_urls = {board: f"{board}/catalog.json" for board in boards}
+        board_urls = {board: f"{board}/threads.json" for board in boards}
         responses = await self.api_call(board_urls)
         to_get = []
         flat_map = [board for board, thread in responses]
-        self.log.debug(f"Got thread list for {flat_map}: {len(responses)}")
-        for mapped, response in responses:
+        self.log.debug(f"Got thread list for {len(responses)} boards: {flat_map}")
+        for board, response in responses:
             if not response:
                 continue
             for page in response:
                 for threads in page["threads"]:
                     no = threads["no"]
-                    to_get.append((mapped, no))
+                    to_get.append((board, no))
 
         if not to_get:
             return
         self.log.debug(f"Got {len(to_get)} threads to fetch")
         split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
-        for threads in split_threads:
-            await self.get_threads_content(threads)
+        self.log.debug(f"Split threads into {len(split_threads)} series")
+        for index, thr in enumerate(split_threads):
+            self.log.debug(f"Series {index} - getting {len(thr)} threads")
+            await self.get_threads_content(thr)
+            await asyncio.sleep(THREADS_DELAY)
         # await self.get_threads_content(to_get)
 
     def take_items(self, dict_list, n):
         i = 0
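Two things change in the hunk above: the crawler now hits threads.json instead of catalog.json, which returns only thread numbers per page rather than full catalog previews (presumably the memory win on the crawler side), and the fetch loop is throttled into logged series instead of firing every batch back-to-back. A minimal standalone sketch of that throttled-series pattern, assuming array_split comes from numpy and ceil from math as the call site suggests, with placeholder THREADS_CONCURRENT and THREADS_DELAY values standing in for the crawler's real constants:

import asyncio
from math import ceil

from numpy import array_split

THREADS_CONCURRENT = 100  # placeholder batch size, not the crawler's real constant
THREADS_DELAY = 0.1  # placeholder inter-series delay in seconds, also assumed


async def get_threads_content(batch):
    # Stand-in for Chan4.get_threads_content: fetch one series of (board, no) pairs.
    print(f"fetching {len(batch)} threads")


async def get_all(to_get):
    # ceil(n / THREADS_CONCURRENT) sections keeps every series at or under
    # THREADS_CONCURRENT items; the sleep spaces the series apart so the
    # whole worklist is no longer requested in one burst.
    split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
    for index, thr in enumerate(split_threads):
        print(f"Series {index} - getting {len(thr)} threads")
        await get_threads_content(thr)
        await asyncio.sleep(THREADS_DELAY)


asyncio.run(get_all([("g", no) for no in range(250)]))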
@@ -132,14 +134,14 @@ class Chan4(object):
         to_store = []
         for key, post_list in posts.items():
             board, thread = key
-            for index, post in enumerate(post_list):
-                posts[key][index]["type"] = "msg"
+            for post in post_list:
+                post["type"] = "msg"
 
-                posts[key][index]["src"] = "4ch"
-                posts[key][index]["net"] = board
-                posts[key][index]["channel"] = thread
+                post["src"] = "4ch"
+                post["net"] = board
+                post["channel"] = thread
 
-                to_store.append(posts[key][index])
+                to_store.append(post)
 
         if to_store:
             await db.queue_message_bulk(to_store)
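The simplification above relies on the fact that iterating a list yields references to the stored dicts, not copies: mutating post is equivalent to mutating posts[key][index], minus the enumerate bookkeeping. A tiny self-contained illustration (the board and thread values are invented):

posts = {("g", 12345): [{"com": "first post"}, {"com": "second post"}]}

to_store = []
for key, post_list in posts.items():
    board, thread = key
    for post in post_list:
        # post is a reference to the dict stored in posts[key], not a copy,
        # so these writes are visible through posts[key][index] as well.
        post["type"] = "msg"
        post["src"] = "4ch"
        post["net"] = board
        post["channel"] = thread
        to_store.append(post)

assert posts[("g", 12345)][0]["net"] == "g"  # mutated through the reference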
@@ -1,8 +1,6 @@
 import asyncio
-from os import getenv
-
 import orjson
 
 import db
 import util
 from processing import process
@@ -20,6 +18,7 @@ class Ingest(object):
     def __init__(self):
         name = self.__class__.__name__
         self.log = util.get_logger(name)
+        self.current_chunk = 0
         self.log.info(
             (
                 "Starting ingest handler for chunk size of "
@@ -30,20 +29,14 @@ class Ingest(object):
     async def run(self):
         while True:
             await self.get_chunk()
+            self.log.debug(f"Ingest chunk {self.current_chunk} complete")
+            self.current_chunk += 1
             await asyncio.sleep(ITER_DELAY)
 
     async def get_chunk(self):
-        items = []
-        # for source in SOURCES:
-        #     key = f"{KEYPREFIX}{source}"
         length = await db.ar.llen(KEYNAME)
-        start_num = length - CHUNK_SIZE
-        chunk = await db.ar.lrange(KEYNAME, start_num, -1)
-        # chunk = await db.ar.rpop(KEYNAME, CHUNK_SIZE)
-        if not chunk:
+        if length > CHUNK_SIZE:
+            length = CHUNK_SIZE
+        if not length:
             return
-        for item in chunk:
-            item = orjson.loads(item)
-            items.append(item)
-        if items:
-            await process.spawn_processing_threads(items)
+        await process.spawn_processing_threads(self.current_chunk, length)
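This last hunk is most of the memory improvement from the commit title: the old get_chunk pulled up to CHUNK_SIZE raw messages out of Redis with lrange and decoded them with orjson.loads inside the ingest loop, while the new one only asks Redis for the queue length and hands a chunk counter plus a capped length to the processing module, which pops and decodes the items itself. A minimal sketch of the new shape, assuming db.ar is an async Redis client such as redis.asyncio and that spawn_processing_threads(chunk, length) is the new signature shown above; the KEYNAME and CHUNK_SIZE values here are placeholders:

import asyncio

import redis.asyncio as redis  # assumed client behind db.ar (redis-py >= 4.2)

KEYNAME = "queue"  # placeholder list key
CHUNK_SIZE = 9000  # placeholder cap per iteration


async def spawn_processing_threads(chunk, length):
    # Stand-in for processing.process.spawn_processing_threads: the real one
    # presumably pops and decodes `length` items inside worker processes.
    print(f"chunk {chunk}: dispatching {length} items")


async def get_chunk(ar, current_chunk):
    # Only the queue length crosses this boundary; the messages themselves
    # stay in Redis until the processing side pops them.
    length = await ar.llen(KEYNAME)
    if length > CHUNK_SIZE:
        length = CHUNK_SIZE
    if not length:
        return
    await spawn_processing_threads(current_chunk, length)


async def main():
    ar = redis.Redis()  # needs a reachable Redis to actually run
    await get_chunk(ar, current_chunk=0)


asyncio.run(main())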