From 318a8ddbd57203dc68b647ac8dd9063b58034f77 Mon Sep 17 00:00:00 2001 From: Mark Veidemanis Date: Mon, 5 Sep 2022 07:20:30 +0100 Subject: [PATCH] Split thread list into chunks to save memory --- db.py | 2 ++ docker/Dockerfile | 2 +- sources/ch4.py | 20 ++++++++++++++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/db.py b/db.py index 97c615c..45ae7c7 100644 --- a/db.py +++ b/db.py @@ -53,6 +53,8 @@ def store_message_bulk(data): :param msg: dict """ print("BULK", len(data)) + if not data: + return split_posts = array_split(data, ceil(len(data) / 10000)) for messages in split_posts: print("PROCESSING SPLIT OF", len(messages), "MESSAGES") diff --git a/docker/Dockerfile b/docker/Dockerfile index c133ace..94da659 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,6 +18,6 @@ COPY discord-patched.tgz /code/ RUN python -m venv /venv RUN . /venv/bin/activate && pip install -r requirements.txt -RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages +RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.9/site-packages CMD . /venv/bin/activate && exec python monolith.py \ No newline at end of file diff --git a/sources/ch4.py b/sources/ch4.py index 324e8ea..c2aeaaa 100644 --- a/sources/ch4.py +++ b/sources/ch4.py @@ -14,6 +14,9 @@ import db import util from schemas.ch4_s import ATTRMAP +from numpy import array_split +from math import ceil + p = ProcessPoolExecutor(10) @@ -75,7 +78,16 @@ class Chan4(object): to_get.append((mapped, no)) self.log.info(f"Got thread list for {mapped}: {len(response)}") - await self.get_threads_content(to_get) + print("THREAD LIST FULL LEN", len(to_get)) + if not to_get: + await self.get_thread_lists(self.boards) + return + split_threads = array_split(to_get, ceil(len(to_get) / 10000)) + print("SPLIT THREADS INTO", len(split_threads)) + for threads in split_threads: + print("SUBMITTED THREADS FOR", len(threads)) + await self.get_threads_content(threads) + #await self.get_threads_content(to_get) # Recurse await self.get_thread_lists(self.boards) @@ -106,6 +118,7 @@ class Chan4(object): # await self.handle_posts(board, thread, response["posts"]) # await asyncio.sleep(1) await self.handle_posts_thread(all_posts) + # self.handle_posts(all_posts) @asyncio.coroutine def handle_posts_thread(self, posts): @@ -114,12 +127,9 @@ class Chan4(object): yield from loop.run_in_executor(p, self.handle_posts, posts) def handle_posts(self, posts): - print("HANDLE POSTS START") to_store = [] for key, post_list in posts.items(): board, thread = key - print("PROCESSING BOARD", board, "THREAD", thread) - print("POSTS HERE", len(post_list)) for index, post in enumerate(post_list): posts[key][index]["type"] = "msg" @@ -161,9 +171,7 @@ class Chan4(object): # print({name_map[name]: val for name, val in post.items()}) # print(f"Got posts: {len(posts)}") - print("HANDLE POSTS DONE") db.store_message_bulk(to_store) - print("STORE DB DONE") async def fetch(self, url, session, mapped): async with session.get(url) as response: