Add throttling for performance

2025-01-24 12:17:22 +00:00
parent 352909bec0
commit 54ecfbae64
3 changed files with 101 additions and 2 deletions
--- a/sources/ch4.py
+++ b/sources/ch4.py
@@ -8,6 +8,8 @@ from os import getenv
 import aiohttp
 from numpy import array_split

+import psutil
+
 import db
 import util

@@ -25,12 +27,14 @@ CRAWL_DELAY = int(getenv("MONOLITH_CH4_CRAWL_DELAY", 5))
 # Semaphore value ?
 THREADS_SEMAPHORE = int(getenv("MONOLITH_CH4_THREADS_SEMAPHORE", 1000))

+# Target CPU usage percentage
+TARGET_CPU_USAGE = float(getenv("MONOLITH_CH4_TARGET_CPU_USAGE", 50.0))
+
 # Boards to crawl
 BOARDS = getenv("MONOLITH_CH4_BOARDS", "").split(",")

 # CONFIGURATION END #

-
 class Chan4(object):
    """
    4chan indexer, crawler and ingester.
@@ -40,6 +44,8 @@ class Chan4(object):
        name = self.__class__.__name__
        self.log = util.get_logger(name)

+        self.sleep_interval = 0.0
+
        self.api_endpoint = "https://a.4cdn.org"
        # self.boards = ["out", "g", "a", "3", "pol"] #
        self.boards = []
@@ -59,6 +65,33 @@ class Chan4(object):
            self.hash_key = self.hash_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {self.hash_key}")

+    async def dynamic_throttle(self):
+        """
+        Sleep before a request, adapting the delay to system CPU usage.
+
+        CPU is sampled for 0.2s in the default executor so the event loop is
+        not blocked. Above TARGET_CPU_USAGE the delay grows by 0.01s (max 0.1s);
+        below it, it shrinks by 0.01s while over 0.01s. Changes are logged.
+        """
+        loop = asyncio.get_event_loop()
+        current_cpu_usage = await loop.run_in_executor(None, psutil.cpu_percent, 0.2)
+
+        if current_cpu_usage > TARGET_CPU_USAGE:
+            self.sleep_interval = min(self.sleep_interval + 0.01, 0.1)
+            self.log.info(
+                f"CPU {current_cpu_usage}% > {TARGET_CPU_USAGE}%, "
+                f"=> sleep {self.sleep_interval:.3f}s"
+            )
+        elif current_cpu_usage < TARGET_CPU_USAGE and self.sleep_interval > 0.01:
+            self.sleep_interval -= 0.01
+            self.log.info(
+                f"CPU {current_cpu_usage}% < {TARGET_CPU_USAGE}%, "
+                f"=> sleep {self.sleep_interval:.3f}s"
+            )
+
+        if self.sleep_interval > 0:
+            await asyncio.sleep(self.sleep_interval)
+
    async def run(self):
        if "ALL" in BOARDS:
            await self.get_board_list()
@@ -76,6 +109,8 @@ class Chan4(object):
            for board in response["boards"]:
                self.boards.append(board["board"])
            self.log.debug(f"Got boards: {self.boards}")
+            # await self.dynamic_throttle()
+            # TODO

    async def get_thread_lists(self, boards):
        # self.log.debug(f"Getting thread list for {boards}")
@@ -91,6 +126,8 @@ class Chan4(object):
                for threads in page["threads"]:
                    no = threads["no"]
                    to_get.append((board, no))
+                    # await self.dynamic_throttle()
+                    # TODO

        if not to_get:
            return
@@ -100,6 +137,8 @@ class Chan4(object):
        for index, thr in enumerate(split_threads):
            self.log.debug(f"Series {index} - getting {len(thr)} threads")
            await self.get_threads_content(thr)
+            # await self.dynamic_throttle()
+            # TODO
            await asyncio.sleep(THREADS_DELAY)

    def take_items(self, dict_list, n):
@@ -130,6 +169,8 @@ class Chan4(object):
                continue
            board, thread = mapped
            all_posts[mapped] = response["posts"]
+            # await self.dynamic_throttle()
+            # TODO

        if not all_posts:
            return
@@ -147,6 +188,8 @@ class Chan4(object):
                post["channel"] = thread

                to_store.append(post)
+                # await self.dynamic_throttle()
+                # TODO

        if to_store:
            await db.queue_message_bulk(to_store)
@@ -161,6 +204,7 @@ class Chan4(object):
    async def bound_fetch(self, sem, url, session, mapped):
        # Getter function with semaphore.
        async with sem:
+            await self.dynamic_throttle()
            try:
                return await self.fetch(url, session, mapped)
            except:  # noqa