Begin implementing RTS

2026-02-17 12:14:29 +00:00
parent dc533f266f
commit 81f05d4263
14 changed files with 484 additions and 268 deletions


@@ -8,9 +8,6 @@ import string
 # For timing
 import time
-# For throttling
-import psutil
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -50,6 +47,9 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 import db
 import util
+# For throttling
+from perf.throttle import DynamicThrottle
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
@@ -62,8 +62,6 @@ MONOLITH_PROCESS_PERFSTATS = (
 )
 TARGET_CPU_USAGE = float(os.getenv("MONOLITH_PROCESS_TARGET_CPU_USAGE", 50.0))
-SLEEP_INTERVAL = 0.0
 CUSTOM_FILTERS = [
     lambda x: x.lower(),
     strip_tags, #
@@ -94,6 +92,19 @@ CPU_THREADS = int(os.getenv("MONOLITH_PROCESS_THREADS", os.cpu_count()))
 p = ProcessPoolExecutor(CPU_THREADS)
+throttle = DynamicThrottle(
+    target_cpu_usage=TARGET_CPU_USAGE,
+    sleep_increment=0.02,
+    sleep_decrement=0.01,
+    sleep_max=0.5,
+    sleep_min=0,
+    psutil_interval=0.1,
+    consecutive_divisor=2,
+    log=log,
+    start_increment=True,
+    use_async=False,
+)
 def get_hash_key():
     hash_key = db.r.get("hashing_key")
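
Note: perf/throttle.py is among the 14 files changed in this commit, but its diff is not shown in this excerpt. The following is a minimal sketch of what the DynamicThrottle interface might look like, inferred purely from the constructor call above and the sleep_time += throttle.wait() call further down; every body here is an assumption, not the real implementation.

import time

import psutil


class DynamicThrottle:
    def __init__(self, target_cpu_usage, sleep_increment, sleep_decrement,
                 sleep_max, sleep_min, psutil_interval, consecutive_divisor,
                 log=None, start_increment=True, use_async=False):
        self.target_cpu_usage = target_cpu_usage
        self.sleep_increment = sleep_increment
        self.sleep_decrement = sleep_decrement
        self.sleep_max = sleep_max
        self.sleep_min = sleep_min
        self.psutil_interval = psutil_interval
        # Role not visible in this excerpt; perhaps it damps repeated
        # adjustments in the same direction.
        self.consecutive_divisor = consecutive_divisor
        self.log = log
        self.use_async = use_async
        # start_increment=True presumably starts one increment above zero.
        self.sleep_interval = sleep_increment if start_increment else sleep_min

    def wait(self):
        # Sample CPU load (blocks for psutil_interval seconds), nudge the
        # interval toward the target, sleep, and return the time slept so
        # the caller can add it to its perf stats.
        cpu = psutil.cpu_percent(interval=self.psutil_interval)
        if cpu > self.target_cpu_usage:
            self.sleep_interval = min(
                self.sleep_interval + self.sleep_increment, self.sleep_max
            )
        elif cpu < self.target_cpu_usage:
            self.sleep_interval = max(
                self.sleep_interval - self.sleep_decrement, self.sleep_min
            )
        time.sleep(self.sleep_interval)
        return self.sleep_interval

This mirrors the inline logic deleted later in this diff: increment the sleep by 0.02 when CPU is over target, decrement by 0.01 when under, clamped to [0, 0.5].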
@@ -136,7 +147,7 @@ async def spawn_processing_threads(chunk, length):
     # Join the results back from the split list
     flat_list = [item for sublist in results for item in sublist]
     total_messages = len(flat_list)
-    log.debug(
+    log.info(
         (
             f"[{chunk}/{index}] Results from processing of {length} messages in "
             f"{cores} threads: {len(flat_list)}"
@@ -149,7 +160,6 @@ async def spawn_processing_threads(chunk, length):
 def process_data(chunk, index, chunk_size):
-    global SLEEP_INTERVAL
     log.debug(f"[{chunk}/{index}] Processing {chunk_size} messages")
     to_store = []
@@ -159,7 +169,6 @@ def process_data(chunk, index, chunk_size):
     date_time = 0.0
     nlp_time = 0.0
     normalise_time = 0.0
-    hash_time = 0.0
     normal2_time = 0.0
     soup_time = 0.0
     sleep_time = 0.0
@@ -170,11 +179,28 @@ def process_data(chunk, index, chunk_size):
     analyzer = SentimentIntensityAnalyzer()
     for msg_index in range(chunk_size):
+        # Print percentage of msg_index relative to chunk_size
+        if msg_index % 10 == 0:
+            percentage_done = (msg_index / chunk_size) * 100
+            log.debug(
+                f"[{chunk}/{index}] {percentage_done:.2f}% done ({msg_index}/{chunk_size})"
+            )
         msg = db.r.rpop(KEYNAME)
         if not msg:
             return
         # TODO: msgpack
         msg = orjson.loads(msg)
+        if msg["src"] == "4ch":
+            board = msg["net"]
+            thread = msg["channel"]
+            redis_key = (
+                f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
+            )
+            key_content = db.r.get(redis_key)
+            if key_content is not None:
+                continue
+            db.r.set(redis_key, "1")
         total_start = time.process_time()
         # normalise fields
         start = time.process_time()
@@ -200,29 +226,6 @@ def process_data(chunk, index, chunk_size):
             board = msg["net"]
             thread = msg["channel"]
-            # Calculate hash for post
-            start = time.process_time()
-            post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
-            hash = siphash(hash_key, post_normalised)
-            hash = str(hash)
-            redis_key = (
-                f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
-            )
-            key_content = db.r.get(redis_key)
-            if key_content is not None:
-                key_content = key_content.decode("ascii")
-                if key_content == hash:
-                    # This deletes the message since the append at the end won't be hit
-                    continue
-                    # pass
-                else:
-                    # msg["type"] = "update"
-                    # Fuck it, updates just brew spam
-                    continue
-            db.r.set(redis_key, hash)
-            time_took = (time.process_time() - start) * 1000
-            hash_time += time_took
         start = time.process_time()
         for key2, value in list(msg.items()):
             if key2 in ATTRMAP:
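
The block deleted above deduplicated by content: it siphash-ed the normalised post, compared the stored hash, and skipped both exact duplicates and changed posts. Its replacement near the top of the loop only checks whether the cache key exists at all. A toy contrast of the two behaviours, with a plain dict standing in for Redis; the names here are illustrative, not from the codebase.

cache = {}

def old_should_store(key, post_hash):
    seen = cache.get(key)
    if seen is None:
        cache[key] = post_hash
        return True
    if seen == post_hash:
        return False  # exact duplicate
    return False      # changed hash = an edited post, also skipped as spam

def new_should_store(key):
    # No hashing at all: the first sighting of a key wins and every
    # later version of the post is dropped.
    if key in cache:
        return False
    cache[key] = "1"
    return True

Since the old code skipped updates anyway, the stored set of posts should be identical; the new path just saves one orjson.dumps and one siphash per message.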
@@ -240,9 +243,10 @@ def process_data(chunk, index, chunk_size):
                 old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
             else:
                 old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
-            # new_ts = old_ts.isoformat()
+            # iso_ts = old_ts.isoformat()
             new_ts = int(old_ts.timestamp())
             msg["ts"] = new_ts
+            # msg["iso"] = iso_ts
         else:
             raise Exception("No TS in msg")
         time_took = (time.process_time() - start) * 1000
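
For reference, the two formats parsed above correspond to 4chan's human-readable now field, with and without seconds. A standalone check of the conversion; note that %a parsing is locale-sensitive and that .timestamp() interprets a naive datetime in the local timezone.

from datetime import datetime

old_time = "12/25/23(Mon)14:30:15"  # sample value shaped like 4chan's `now` field
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
new_ts = int(old_ts.timestamp())
print(new_ts)  # 1703514615 if the local timezone is UTC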
@@ -302,39 +306,22 @@ def process_data(chunk, index, chunk_size):
         # Dynamic throttling to reduce CPU usage
-        if msg_index % 5 == 0:
-            current_cpu_usage = psutil.cpu_percent(interval=0.2)
-            if current_cpu_usage > TARGET_CPU_USAGE:
-                SLEEP_INTERVAL += 0.02
-                if SLEEP_INTERVAL > 0.5:
-                    SLEEP_INTERVAL = 0.5
-                log.info(
-                    f"CPU {current_cpu_usage}% > {TARGET_CPU_USAGE}%, "
-                    f"=> sleep {SLEEP_INTERVAL:.3f}s"
-                )
-            elif current_cpu_usage < TARGET_CPU_USAGE and SLEEP_INTERVAL > 0.01:
-                SLEEP_INTERVAL -= 0.01
-                log.info(
-                    f"CPU {current_cpu_usage}% < {TARGET_CPU_USAGE}%, "
-                    f"=> sleep {SLEEP_INTERVAL:.3f}s"
-                )
-        time.sleep(SLEEP_INTERVAL)
-        sleep_time += SLEEP_INTERVAL
+        sleep_time += throttle.wait()
     if MONOLITH_PROCESS_PERFSTATS:
-        log.debug("=====================================")
-        log.debug(f"Chunk: {chunk}")
-        log.debug(f"Index: {index}")
-        log.debug(f"Sentiment: {sentiment_time}")
-        log.debug(f"Regex: {regex_time}")
-        log.debug(f"Polyglot: {polyglot_time}")
-        log.debug(f"Date: {date_time}")
-        log.debug(f"NLP: {nlp_time}")
-        log.debug(f"Normalise: {normalise_time}")
-        log.debug(f"Hash: {hash_time}")
-        log.debug(f"Normal2: {normal2_time}")
-        log.debug(f"Soup: {soup_time}")
-        log.debug(f"Total: {total_time}")
-        log.debug(f"Throttling: {sleep_time}")
-        log.debug("=====================================")
+        log.info("=====================================")
+        log.info(f"Chunk: {chunk}")
+        log.info(f"Index: {index}")
+        log.info(f"Sentiment: {sentiment_time}")
+        log.info(f"Regex: {regex_time}")
+        log.info(f"Polyglot: {polyglot_time}")
+        log.info(f"Date: {date_time}")
+        log.info(f"NLP: {nlp_time}")
+        log.info(f"Normalise: {normalise_time}")
+        log.info(f"Normal2: {normal2_time}")
+        log.info(f"Soup: {soup_time}")
+        log.info(f"Total: {total_time}")
+        log.info(f"Throttling: {sleep_time}")
+        log.info("=====================================")
     return to_store
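
One detail worth noting in the perf stats above: every accumulator except sleep_time is built from (time.process_time() - start) * 1000, which counts CPU milliseconds and does not advance during sleep — which is exactly why throttle.wait() has to return the slept time for separate accounting. A quick demonstration:

import time

start = time.process_time()
sum(i * i for i in range(10**6))  # CPU-bound work: visible to process_time
time.sleep(0.5)                   # sleeping: invisible to process_time
cpu_ms = (time.process_time() - start) * 1000
print(f"{cpu_ms:.1f} ms of CPU time; the 500 ms sleep is not included")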