Add throttling for performance

This commit is contained in:
2025-01-24 12:17:22 +00:00
parent 352909bec0
commit 54ecfbae64
3 changed files with 101 additions and 2 deletions

View File

@@ -8,6 +8,8 @@ from os import getenv
import aiohttp
from numpy import array_split
import psutil
import db
import util
@@ -25,12 +27,14 @@ CRAWL_DELAY = int(getenv("MONOLITH_CH4_CRAWL_DELAY", 5))
# Semaphore size (presumably the cap on concurrent thread fetches; confirm against usage)
THREADS_SEMAPHORE = int(getenv("MONOLITH_CH4_THREADS_SEMAPHORE", 1000))
# Target CPU usage percentage
TARGET_CPU_USAGE = float(getenv("MONOLITH_CH4_TARGET_CPU_USAGE", 50.0))
# Boards to crawl
BOARDS = getenv("MONOLITH_CH4_BOARDS", "").split(",")
# CONFIGURATION END #
class Chan4(object):
"""
4chan indexer, crawler and ingester.
@@ -40,6 +44,8 @@ class Chan4(object):
name = self.__class__.__name__
self.log = util.get_logger(name)
self.sleep_interval = 0.0
self.api_endpoint = "https://a.4cdn.org"
# self.boards = ["out", "g", "a", "3", "pol"] #
self.boards = []
@@ -59,6 +65,33 @@ class Chan4(object):
self.hash_key = self.hash_key.decode("ascii")
self.log.debug(f"Decoded hash key: {self.hash_key}")
async def dynamic_throttle(self):
    """
    Adaptively delay the caller so CPU usage trends toward TARGET_CPU_USAGE.

    CPU usage is sampled over a 0.2s window. If it is above the target,
    the shared sleep interval grows by 0.01s, capped at 0.1s. If it is
    below the target and the interval is still above 0.01s, the interval
    shrinks by 0.01s; once raised, it therefore never drops back below
    roughly 0.01s. Every adjustment is reported through self.log.info.
    Finally the coroutine sleeps for the current interval, if non-zero.
    """
    # psutil.cpu_percent(interval=0.2) blocks for the whole sampling
    # window. Run it in the default executor so the event loop (and every
    # other in-flight request) is not stalled while measuring.
    loop = asyncio.get_running_loop()
    current_cpu_usage = await loop.run_in_executor(
        None, psutil.cpu_percent, 0.2
    )

    if current_cpu_usage > TARGET_CPU_USAGE:
        # Back off a little more, but never beyond the 0.1s cap.
        self.sleep_interval = min(self.sleep_interval + 0.01, 0.1)
        self.log.info(
            f"CPU {current_cpu_usage}% > {TARGET_CPU_USAGE}%, "
            f"=> sleep {self.sleep_interval:.3f}s"
        )
    elif current_cpu_usage < TARGET_CPU_USAGE and self.sleep_interval > 0.01:
        # Headroom available: relax the delay one step.
        self.sleep_interval -= 0.01
        self.log.info(
            f"CPU {current_cpu_usage}% < {TARGET_CPU_USAGE}%, "
            f"=> sleep {self.sleep_interval:.3f}s"
        )

    if self.sleep_interval > 0:
        await asyncio.sleep(self.sleep_interval)
async def run(self):
if "ALL" in BOARDS:
await self.get_board_list()
@@ -76,6 +109,8 @@ class Chan4(object):
for board in response["boards"]:
self.boards.append(board["board"])
self.log.debug(f"Got boards: {self.boards}")
# await self.dynamic_throttle()
# TODO: decide whether to throttle at this point as well
async def get_thread_lists(self, boards):
# self.log.debug(f"Getting thread list for {boards}")
@@ -91,6 +126,8 @@ class Chan4(object):
for threads in page["threads"]:
no = threads["no"]
to_get.append((board, no))
# await self.dynamic_throttle()
# TODO: decide whether to throttle at this point as well
if not to_get:
return
@@ -100,6 +137,8 @@ class Chan4(object):
for index, thr in enumerate(split_threads):
self.log.debug(f"Series {index} - getting {len(thr)} threads")
await self.get_threads_content(thr)
# await self.dynamic_throttle()
# TODO: decide whether to throttle at this point as well
await asyncio.sleep(THREADS_DELAY)
def take_items(self, dict_list, n):
@@ -130,6 +169,8 @@ class Chan4(object):
continue
board, thread = mapped
all_posts[mapped] = response["posts"]
# await self.dynamic_throttle()
# TODO: decide whether to throttle at this point as well
if not all_posts:
return
@@ -147,6 +188,8 @@ class Chan4(object):
post["channel"] = thread
to_store.append(post)
# await self.dynamic_throttle()
# TODO: decide whether to throttle at this point as well
if to_store:
await db.queue_message_bulk(to_store)
@@ -161,6 +204,7 @@ class Chan4(object):
async def bound_fetch(self, sem, url, session, mapped):
# Getter function with semaphore.
async with sem:
await self.dynamic_throttle()
try:
return await self.fetch(url, session, mapped)
except: # noqa