Split thread list into chunks to save memory

2022-09-05 07:20:30 +01:00 · 2022-09-05 07:20:30 +01:00 · f8fc5e1a1b
parent 6e00f70184
commit f8fc5e1a1b
3 changed files with 17 additions and 7 deletions
--- a/db.py
+++ b/db.py
@ -53,6 +53,8 @@ def store_message_bulk(data):
    :param data: sequence of messages (dicts) to store in bulk
    """
    print("BULK", len(data))
    if not data:
        return
    split_posts = array_split(data, ceil(len(data) / 10000))
    for messages in split_posts:
        print("PROCESSING SPLIT OF", len(messages), "MESSAGES")
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -18,6 +18,6 @@ COPY discord-patched.tgz /code/
 RUN python -m venv /venv
 RUN . /venv/bin/activate && pip install -r requirements.txt
-RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
+RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.9/site-packages
 CMD . /venv/bin/activate && exec python monolith.py
--- a/sources/ch4.py
+++ b/sources/ch4.py
@ -14,6 +14,9 @@ import db
 import util
 from schemas.ch4_s import ATTRMAP
 from numpy import array_split
 from math import ceil
 p = ProcessPoolExecutor(10)
@ -75,7 +78,16 @@ class Chan4(object):
                    to_get.append((mapped, no))
            self.log.info(f"Got thread list for {mapped}: {len(response)}")
-        await self.get_threads_content(to_get)
+        print("THREAD LIST FULL LEN", len(to_get))
        if not to_get:
            await self.get_thread_lists(self.boards)
            return
        split_threads = array_split(to_get, ceil(len(to_get) / 10000))
        print("SPLIT THREADS INTO", len(split_threads))
        for threads in split_threads:
            print("SUBMITTED THREADS FOR", len(threads))
            await self.get_threads_content(threads)
        #await self.get_threads_content(to_get)
        # Recurse
        await self.get_thread_lists(self.boards)
@ -106,6 +118,7 @@ class Chan4(object):
        # await self.handle_posts(board, thread, response["posts"])
        # await asyncio.sleep(1)
        await self.handle_posts_thread(all_posts)
        # self.handle_posts(all_posts)
    @asyncio.coroutine
    def handle_posts_thread(self, posts):
@ -114,12 +127,9 @@ class Chan4(object):
        yield from loop.run_in_executor(p, self.handle_posts, posts)
    def handle_posts(self, posts):
        print("HANDLE POSTS START")
        to_store = []
        for key, post_list in posts.items():
            board, thread = key
            print("PROCESSING BOARD", board, "THREAD", thread)
            print("POSTS HERE", len(post_list))
            for index, post in enumerate(post_list):
                posts[key][index]["type"] = "msg"
@ -161,9 +171,7 @@ class Chan4(object):
            # print({name_map[name]: val for name, val in post.items()})
        # print(f"Got posts: {len(posts)}")
        print("HANDLE POSTS DONE")
        db.store_message_bulk(to_store)
        print("STORE DB DONE")
    async def fetch(self, url, session, mapped):
        async with session.get(url) as response: