Begin implementing RTS

2026-02-17 12:14:29 +00:00
parent dc533f266f
commit 81f05d4263
14 changed files with 484 additions and 268 deletions


@@ -8,9 +8,6 @@ import string
 # For timing
 import time
-# For throttling
-import psutil
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -50,6 +47,9 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 import db
 import util
+# For throttling
+from perf.throttle import DynamicThrottle
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
@@ -62,8 +62,6 @@ MONOLITH_PROCESS_PERFSTATS = (
 )
 TARGET_CPU_USAGE = float(os.getenv("MONOLITH_PROCESS_TARGET_CPU_USAGE", 50.0))
-SLEEP_INTERVAL = 0.0
 CUSTOM_FILTERS = [
     lambda x: x.lower(),
     strip_tags, #
@@ -94,6 +92,19 @@ CPU_THREADS = int(os.getenv("MONOLITH_PROCESS_THREADS", os.cpu_count()))
 p = ProcessPoolExecutor(CPU_THREADS)
+throttle = DynamicThrottle(
+    target_cpu_usage=TARGET_CPU_USAGE,
+    sleep_increment=0.02,
+    sleep_decrement=0.01,
+    sleep_max=0.5,
+    sleep_min=0,
+    psutil_interval=0.1,
+    consecutive_divisor=2,
+    log=log,
+    start_increment=True,
+    use_async=False,
+)
 def get_hash_key():
     hash_key = db.r.get("hashing_key")
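
Note: perf/throttle.py is among the 14 files changed in this commit, but its diff is not shown in this excerpt. The following is a minimal sketch of what the DynamicThrottle interface might look like, inferred purely from the constructor call above and the sleep_time += throttle.wait() call further down; every body here is an assumption, not the real implementation.

import time

import psutil


class DynamicThrottle:
    def __init__(self, target_cpu_usage, sleep_increment, sleep_decrement,
                 sleep_max, sleep_min, psutil_interval, consecutive_divisor,
                 log=None, start_increment=True, use_async=False):
        self.target_cpu_usage = target_cpu_usage
        self.sleep_increment = sleep_increment
        self.sleep_decrement = sleep_decrement
        self.sleep_max = sleep_max
        self.sleep_min = sleep_min
        self.psutil_interval = psutil_interval
        # Role not visible in this excerpt; perhaps it damps repeated
        # adjustments in the same direction.
        self.consecutive_divisor = consecutive_divisor
        self.log = log
        self.use_async = use_async
        # start_increment=True presumably starts one increment above zero.
        self.sleep_interval = sleep_increment if start_increment else sleep_min

    def wait(self):
        # Sample CPU load (blocks for psutil_interval seconds), nudge the
        # interval toward the target, sleep, and return the time slept so
        # the caller can add it to its perf stats.
        cpu = psutil.cpu_percent(interval=self.psutil_interval)
        if cpu > self.target_cpu_usage:
            self.sleep_interval = min(
                self.sleep_interval + self.sleep_increment, self.sleep_max
            )
        elif cpu < self.target_cpu_usage:
            self.sleep_interval = max(
                self.sleep_interval - self.sleep_decrement, self.sleep_min
            )
        time.sleep(self.sleep_interval)
        return self.sleep_interval

This mirrors the inline logic deleted later in this diff: increment the sleep by 0.02 when CPU is over target, decrement by 0.01 when under, clamped to [0, 0.5].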
@@ -136,7 +147,7 @@ async def spawn_processing_threads(chunk, length):
     # Join the results back from the split list
     flat_list = [item for sublist in results for item in sublist]
     total_messages = len(flat_list)
-    log.debug(
+    log.info(
         (
             f"[{chunk}/{index}] Results from processing of {length} messages in "
             f"{cores} threads: {len(flat_list)}"
@@ -149,7 +160,6 @@ async def spawn_processing_threads(chunk, length):
 def process_data(chunk, index, chunk_size):
-    global SLEEP_INTERVAL
     log.debug(f"[{chunk}/{index}] Processing {chunk_size} messages")
     to_store = []
@@ -159,7 +169,6 @@ def process_data(chunk, index, chunk_size):
     date_time = 0.0
     nlp_time = 0.0
     normalise_time = 0.0
-    hash_time = 0.0
     normal2_time = 0.0
     soup_time = 0.0
     sleep_time = 0.0
@@ -170,11 +179,28 @@ def process_data(chunk, index, chunk_size):
     analyzer = SentimentIntensityAnalyzer()
     for msg_index in range(chunk_size):
+        # Print percentage of msg_index relative to chunk_size
+        if msg_index % 10 == 0:
+            percentage_done = (msg_index / chunk_size) * 100
+            log.debug(
+                f"[{chunk}/{index}] {percentage_done:.2f}% done ({msg_index}/{chunk_size})"
+            )
         msg = db.r.rpop(KEYNAME)
         if not msg:
             return
         # TODO: msgpack
         msg = orjson.loads(msg)
+        if msg["src"] == "4ch":
+            board = msg["net"]
+            thread = msg["channel"]
+            redis_key = (
+                f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
+            )
+            key_content = db.r.get(redis_key)
+            if key_content is not None:
+                continue
+            db.r.set(redis_key, "1")
         total_start = time.process_time()
         # normalise fields
         start = time.process_time()
@@ -200,29 +226,6 @@ def process_data(chunk, index, chunk_size):
             board = msg["net"]
             thread = msg["channel"]
-            # Calculate hash for post
-            start = time.process_time()
-            post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
-            hash = siphash(hash_key, post_normalised)
-            hash = str(hash)
-            redis_key = (
-                f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
-            )
-            key_content = db.r.get(redis_key)
-            if key_content is not None:
-                key_content = key_content.decode("ascii")
-                if key_content == hash:
-                    # This deletes the message since the append at the end won't be hit
-                    continue
-                    # pass
-                else:
-                    # msg["type"] = "update"
-                    # Fuck it, updates just brew spam
-                    continue
-            db.r.set(redis_key, hash)
-            time_took = (time.process_time() - start) * 1000
-            hash_time += time_took
         start = time.process_time()
         for key2, value in list(msg.items()):
             if key2 in ATTRMAP:
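
The block deleted above deduplicated by content: it siphash-ed the normalised post, compared the stored hash, and skipped both exact duplicates and changed posts. Its replacement near the top of the loop only checks whether the cache key exists at all. A toy contrast of the two behaviours, with a plain dict standing in for Redis; the names here are illustrative, not from the codebase.

cache = {}

def old_should_store(key, post_hash):
    seen = cache.get(key)
    if seen is None:
        cache[key] = post_hash
        return True
    if seen == post_hash:
        return False  # exact duplicate
    return False      # changed hash = an edited post, also skipped as spam

def new_should_store(key):
    # No hashing at all: the first sighting of a key wins and every
    # later version of the post is dropped.
    if key in cache:
        return False
    cache[key] = "1"
    return True

Since the old code skipped updates anyway, the stored set of posts should be identical; the new path just saves one orjson.dumps and one siphash per message.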
@@ -240,9 +243,10 @@ def process_data(chunk, index, chunk_size):
                 old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
             else:
                 old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
-            # new_ts = old_ts.isoformat()
+            # iso_ts = old_ts.isoformat()
             new_ts = int(old_ts.timestamp())
             msg["ts"] = new_ts
+            # msg["iso"] = iso_ts
         else:
             raise Exception("No TS in msg")
         time_took = (time.process_time() - start) * 1000
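
For reference, the two formats parsed above correspond to 4chan's human-readable now field, with and without seconds. A standalone check of the conversion; note that %a parsing is locale-sensitive and that .timestamp() interprets a naive datetime in the local timezone.

from datetime import datetime

old_time = "12/25/23(Mon)14:30:15"  # sample value shaped like 4chan's `now` field
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
new_ts = int(old_ts.timestamp())
print(new_ts)  # 1703514615 if the local timezone is UTC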
@@ -302,39 +306,22 @@ def process_data(chunk, index, chunk_size):
         # Dynamic throttling to reduce CPU usage
-        if msg_index % 5 == 0:
-            current_cpu_usage = psutil.cpu_percent(interval=0.2)
-            if current_cpu_usage > TARGET_CPU_USAGE:
-                SLEEP_INTERVAL += 0.02
-                if SLEEP_INTERVAL > 0.5:
-                    SLEEP_INTERVAL = 0.5
-                log.info(
-                    f"CPU {current_cpu_usage}% > {TARGET_CPU_USAGE}%, "
-                    f"=> sleep {SLEEP_INTERVAL:.3f}s"
-                )
-            elif current_cpu_usage < TARGET_CPU_USAGE and SLEEP_INTERVAL > 0.01:
-                SLEEP_INTERVAL -= 0.01
-                log.info(
-                    f"CPU {current_cpu_usage}% < {TARGET_CPU_USAGE}%, "
-                    f"=> sleep {SLEEP_INTERVAL:.3f}s"
-                )
-        time.sleep(SLEEP_INTERVAL)
-        sleep_time += SLEEP_INTERVAL
+        sleep_time += throttle.wait()
     if MONOLITH_PROCESS_PERFSTATS:
-        log.debug("=====================================")
-        log.debug(f"Chunk: {chunk}")
-        log.debug(f"Index: {index}")
-        log.debug(f"Sentiment: {sentiment_time}")
-        log.debug(f"Regex: {regex_time}")
-        log.debug(f"Polyglot: {polyglot_time}")
-        log.debug(f"Date: {date_time}")
-        log.debug(f"NLP: {nlp_time}")
-        log.debug(f"Normalise: {normalise_time}")
-        log.debug(f"Hash: {hash_time}")
-        log.debug(f"Normal2: {normal2_time}")
-        log.debug(f"Soup: {soup_time}")
-        log.debug(f"Total: {total_time}")
-        log.debug(f"Throttling: {sleep_time}")
-        log.debug("=====================================")
+        log.info("=====================================")
+        log.info(f"Chunk: {chunk}")
+        log.info(f"Index: {index}")
+        log.info(f"Sentiment: {sentiment_time}")
+        log.info(f"Regex: {regex_time}")
+        log.info(f"Polyglot: {polyglot_time}")
+        log.info(f"Date: {date_time}")
+        log.info(f"NLP: {nlp_time}")
+        log.info(f"Normalise: {normalise_time}")
+        log.info(f"Normal2: {normal2_time}")
+        log.info(f"Soup: {soup_time}")
+        log.info(f"Total: {total_time}")
+        log.info(f"Throttling: {sleep_time}")
+        log.info("=====================================")
     return to_store
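
One detail worth noting in the perf stats above: every accumulator except sleep_time is built from (time.process_time() - start) * 1000, which counts CPU milliseconds and does not advance during sleep — which is exactly why throttle.wait() has to return the slept time for separate accounting. A quick demonstration:

import time

start = time.process_time()
sum(i * i for i in range(10**6))  # CPU-bound work: visible to process_time
time.sleep(0.5)                   # sleeping: invisible to process_time
cpu_ms = (time.process_time() - start) * 1000
print(f"{cpu_ms:.1f} ms of CPU time; the 500 ms sleep is not included")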