Begin implementing RTS
@@ -8,9 +8,6 @@ import string
# For timing
import time

# For throttling
import psutil

# Squash errors
import warnings
from concurrent.futures import ProcessPoolExecutor
@@ -50,6 +47,9 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import db
import util

# For throttling
from perf.throttle import DynamicThrottle

# 4chan schema
from schemas.ch4_s import ATTRMAP

@@ -62,8 +62,6 @@ MONOLITH_PROCESS_PERFSTATS = (
)
TARGET_CPU_USAGE = float(os.getenv("MONOLITH_PROCESS_TARGET_CPU_USAGE", 50.0))

SLEEP_INTERVAL = 0.0

CUSTOM_FILTERS = [
lambda x: x.lower(),
strip_tags, #
@@ -94,6 +92,19 @@ CPU_THREADS = int(os.getenv("MONOLITH_PROCESS_THREADS", os.cpu_count()))

p = ProcessPoolExecutor(CPU_THREADS)

throttle = DynamicThrottle(
target_cpu_usage=TARGET_CPU_USAGE,
sleep_increment=0.02,
sleep_decrement=0.01,
sleep_max=0.5,
sleep_min=0,
psutil_interval=0.1,
consecutive_divisor=2,
log=log,
start_increment=True,
use_async=False,
)


def get_hash_key():
hash_key = db.r.get("hashing_key")
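Note: perf/throttle.py itself is not part of this diff, so only the constructor call above is visible. Below is a minimal sketch of what a DynamicThrottle with this interface could look like, inferred from the keyword arguments and from the inline psutil logic this commit removes later on; the class body, and the meaning of start_increment, consecutive_divisor and use_async, are assumptions rather than the actual implementation.

```python
# Illustrative stand-in for perf.throttle.DynamicThrottle (assumed, not the
# real module). It mirrors the constructor used above; wait() sleeps for the
# current interval and returns the seconds slept so callers can accumulate
# per-chunk throttling time.
import time

import psutil


class DynamicThrottle:
    def __init__(
        self,
        target_cpu_usage=50.0,
        sleep_increment=0.02,
        sleep_decrement=0.01,
        sleep_max=0.5,
        sleep_min=0.0,
        psutil_interval=0.1,
        consecutive_divisor=2,
        log=None,
        start_increment=True,
        use_async=False,
    ):
        self.target_cpu_usage = target_cpu_usage
        self.sleep_increment = sleep_increment
        self.sleep_decrement = sleep_decrement
        self.sleep_max = sleep_max
        self.sleep_min = sleep_min
        self.psutil_interval = psutil_interval
        # consecutive_divisor and use_async are accepted but not modelled here;
        # their exact semantics are not visible in this diff.
        self.consecutive_divisor = consecutive_divisor
        self.use_async = use_async
        self.log = log
        # Guess: start from the minimum sleep and ramp up when the CPU is busy.
        self.sleep_interval = sleep_min if start_increment else sleep_max

    def wait(self):
        cpu = psutil.cpu_percent(interval=self.psutil_interval)
        if cpu > self.target_cpu_usage:
            # Over target: back off a little more each call, up to sleep_max.
            self.sleep_interval = min(
                self.sleep_interval + self.sleep_increment, self.sleep_max
            )
        else:
            # Under target: ease the sleep back down towards sleep_min.
            self.sleep_interval = max(
                self.sleep_interval - self.sleep_decrement, self.sleep_min
            )
        time.sleep(self.sleep_interval)
        return self.sleep_interval
```

The 0.02/0.01/0.5 step and clamp values mirror the constants this commit deletes from the inline SLEEP_INTERVAL logic in process_data.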
@@ -136,7 +147,7 @@ async def spawn_processing_threads(chunk, length):
# Join the results back from the split list
flat_list = [item for sublist in results for item in sublist]
total_messages = len(flat_list)
log.debug(
log.info(
(
f"[{chunk}/{index}] Results from processing of {length} messages in "
f"{cores} threads: {len(flat_list)}"
@@ -149,7 +160,6 @@ async def spawn_processing_threads(chunk, length):


def process_data(chunk, index, chunk_size):
global SLEEP_INTERVAL
log.debug(f"[{chunk}/{index}] Processing {chunk_size} messages")
to_store = []

@@ -159,7 +169,6 @@ def process_data(chunk, index, chunk_size):
date_time = 0.0
nlp_time = 0.0
normalise_time = 0.0
hash_time = 0.0
normal2_time = 0.0
soup_time = 0.0
sleep_time = 0.0
@@ -170,11 +179,28 @@ def process_data(chunk, index, chunk_size):
analyzer = SentimentIntensityAnalyzer()

for msg_index in range(chunk_size):
# Print percentage of msg_index relative to chunk_size
if msg_index % 10 == 0:
percentage_done = (msg_index / chunk_size) * 100
log.debug(
f"[{chunk}/{index}] {percentage_done:.2f}% done ({msg_index}/{chunk_size})"
)

msg = db.r.rpop(KEYNAME)
if not msg:
return
# TODO: msgpack
msg = orjson.loads(msg)
if msg["src"] == "4ch":
board = msg["net"]
thread = msg["channel"]
redis_key = (
f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
)
key_content = db.r.get(redis_key)
if key_content is not None:
continue
db.r.set(redis_key, "1")

total_start = time.process_time()
# normalise fields
start = time.process_time()
@@ -200,29 +226,6 @@ def process_data(chunk, index, chunk_size):
board = msg["net"]
thread = msg["channel"]

# Calculate hash for post
start = time.process_time()
post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
hash = siphash(hash_key, post_normalised)
hash = str(hash)
redis_key = (
f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
)
key_content = db.r.get(redis_key)
if key_content is not None:
key_content = key_content.decode("ascii")
if key_content == hash:
# This deletes the message since the append at the end won't be hit
continue
# pass
else:
# msg["type"] = "update"
# Fuck it, updates just brew spam
continue
db.r.set(redis_key, hash)
time_took = (time.process_time() - start) * 1000
hash_time += time_took

start = time.process_time()
for key2, value in list(msg.items()):
if key2 in ATTRMAP:
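The hunk above removes the old duplicate check, which hashed the whole post with siphash over a key-sorted orjson dump and compared it to the value cached in Redis. The replacement added earlier in this diff only checks whether the cache key exists at all, which (like the removed path, per its comment about updates) drops re-seen posts whether or not their content changed. A self-contained sketch of that presence-only check, with a plain dict standing in for Redis purely for illustration:

```python
# Presence-only dedup, as introduced by this commit (dict in place of Redis).
seen = {}


def already_processed(msg) -> bool:
    # Same key shape as the diff: cache.<board>.<thread>.<no>.<resto>.<now>
    key = f"cache.{msg['net']}.{msg['channel']}.{msg['no']}.{msg['resto']}.{msg['now']}"
    if key in seen:
        return True
    seen[key] = "1"
    return False


post = {"net": "g", "channel": "123456", "no": "123457", "resto": "123456", "now": "01/02/23(Mon)14:05"}
print(already_processed(post))  # False: first sighting, post gets processed
print(already_processed(post))  # True: duplicate, post is skipped
```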
@@ -240,9 +243,10 @@ def process_data(chunk, index, chunk_size):
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
else:
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
# new_ts = old_ts.isoformat()
# iso_ts = old_ts.isoformat()
new_ts = int(old_ts.timestamp())
msg["ts"] = new_ts
# msg["iso"] = iso_ts
else:
raise Exception("No TS in msg")
time_took = (time.process_time() - start) * 1000
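For reference, the conversion in the hunk above turns 4chan's human-readable now field into an integer Unix timestamp stored under msg["ts"]. A standalone version of that conversion follows; the check used to pick between the two formats is not visible in the hunk, so the seconds-detection below is an assumption.

```python
from datetime import datetime


def to_unix_ts(old_time: str) -> int:
    # 4chan "now" strings look like "01/02/23(Mon)14:05" or "01/02/23(Mon)14:05:59"
    if old_time.count(":") == 2:
        old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
    else:
        old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
    # Same as the diff: keep only the integer Unix timestamp
    return int(old_ts.timestamp())


print(to_unix_ts("01/02/23(Mon)14:05:59"))  # integer value depends on the local timezone
```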
@@ -302,39 +306,22 @@ def process_data(chunk, index, chunk_size):

# Dynamic throttling to reduce CPU usage
if msg_index % 5 == 0:
current_cpu_usage = psutil.cpu_percent(interval=0.2)
if current_cpu_usage > TARGET_CPU_USAGE:
SLEEP_INTERVAL += 0.02
if SLEEP_INTERVAL > 0.5:
SLEEP_INTERVAL = 0.5
log.info(
f"CPU {current_cpu_usage}% > {TARGET_CPU_USAGE}%, "
f"=> sleep {SLEEP_INTERVAL:.3f}s"
)
elif current_cpu_usage < TARGET_CPU_USAGE and SLEEP_INTERVAL > 0.01:
SLEEP_INTERVAL -= 0.01
log.info(
f"CPU {current_cpu_usage}% < {TARGET_CPU_USAGE}%, "
f"=> sleep {SLEEP_INTERVAL:.3f}s"
)
time.sleep(SLEEP_INTERVAL)
sleep_time += SLEEP_INTERVAL
sleep_time += throttle.wait()

if MONOLITH_PROCESS_PERFSTATS:
log.debug("=====================================")
log.debug(f"Chunk: {chunk}")
log.debug(f"Index: {index}")
log.debug(f"Sentiment: {sentiment_time}")
log.debug(f"Regex: {regex_time}")
log.debug(f"Polyglot: {polyglot_time}")
log.debug(f"Date: {date_time}")
log.debug(f"NLP: {nlp_time}")
log.debug(f"Normalise: {normalise_time}")
log.debug(f"Hash: {hash_time}")
log.debug(f"Normal2: {normal2_time}")
log.debug(f"Soup: {soup_time}")
log.debug(f"Total: {total_time}")
log.debug(f"Throttling: {sleep_time}")
log.debug("=====================================")
log.info("=====================================")
log.info(f"Chunk: {chunk}")
log.info(f"Index: {index}")
log.info(f"Sentiment: {sentiment_time}")
log.info(f"Regex: {regex_time}")
log.info(f"Polyglot: {polyglot_time}")
log.info(f"Date: {date_time}")
log.info(f"NLP: {nlp_time}")
log.info(f"Normalise: {normalise_time}")
log.info(f"Normal2: {normal2_time}")
log.info(f"Soup: {soup_time}")
log.info(f"Total: {total_time}")
log.info(f"Throttling: {sleep_time}")
log.info("=====================================")

return to_store