Use only one Redis key for the queue to make chunk size more precise for thread allocation

Mark Veidemanis 2022-09-30 07:22:22 +01:00
parent a5d29606e9
commit 02ff44a6f5
7 changed files with 25 additions and 26 deletions
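
The gist of the change: instead of one `queue.<src>` Redis set per source, every producer writes to a single `queue` set and the ingest loop pops one chunk from it, so a chunk is a single SPOP with a count. A minimal sketch of the pattern, assuming a plain `redis.asyncio` client rather than the repository's own wiring in db.py (where the client is `ar`):

import orjson
from redis import asyncio as aioredis

KEYNAME = "queue"    # the single shared key introduced by this commit
CHUNK_SIZE = 900     # default of MONOLITH_INGEST_CHUNK_SIZE

r = aioredis.Redis()  # illustrative client; the repository creates its own in db.py

async def queue_message(msg: dict) -> None:
    # Producer side: every source now lands in the same Redis set.
    await r.sadd(KEYNAME, orjson.dumps(msg))

async def get_chunk() -> list[dict]:
    # Consumer side: one SPOP with a count drains at most CHUNK_SIZE
    # items in total, regardless of how traffic is split across sources.
    chunk = await r.spop(KEYNAME, CHUNK_SIZE)
    return [orjson.loads(item) for item in chunk or []]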

db.py
View File

@@ -40,7 +40,7 @@ MAIN_SRC_MAP = {
 TYPES_META = ["who"]
 TYPES_INT = ["conn", "highlight", "znc", "query", "self"]
-KEYPREFIX = "queue."
+KEYNAME = "queue"
 async def store_kafka_batch(data):

@@ -113,12 +113,12 @@ async def queue_message(msg):
     """
     Queue a message on the Redis buffer.
     """
-    src = msg["src"]
+    # src = msg["src"]
     message = orjson.dumps(msg)
-    key = f"{KEYPREFIX}{src}"
+    # key = f"{KEYPREFIX}{src}"
     # log.debug(f"Queueing single message of string length {len(message)}")
-    await ar.sadd(key, message)
+    await ar.sadd(KEYNAME, message)
 async def queue_message_bulk(data):

@@ -127,8 +127,8 @@ async def queue_message_bulk(data):
     """
     # log.debug(f"Queueing message batch of length {len(data)}")
     for msg in data:
-        src = msg["src"]
+        # src = msg["src"]
         message = orjson.dumps(msg)
-        key = f"{KEYPREFIX}{src}"
-        await ar.sadd(key, message)
+        # key = f"{KEYPREFIX}{src}"
+        await ar.sadd(KEYNAME, message)

View File

@@ -127,11 +127,11 @@ services:
       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
       KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
       KAFKA_MESSAGE_MAX_BYTES: 2000000
-      KAFKA_HEAP_OPTS: -Xmx2g
+      #KAFKA_HEAP_OPTS: -Xmx2g
     healthcheck:
       test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "kafka:9092"]
       start_period: 15s
-      interval: 2s
+      interval: 30s
       timeout: 30s
       retries: 45

View File

@@ -121,11 +121,11 @@ services:
       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
       KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
       KAFKA_MESSAGE_MAX_BYTES: 2000000
-      KAFKA_HEAP_OPTS: -Xmx2g
+      #KAFKA_HEAP_OPTS: -Xmx2g
     healthcheck:
       test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "kafka:9092"]
       start_period: 15s
-      interval: 2s
+      interval: 30s
       timeout: 30s
       retries: 45

View File

@@ -67,7 +67,7 @@ def parsemeta(numName, c):
 def queue_message(c):
     message = json.dumps(c)
-    main.g.sadd(main.config["Ingest"]["Key"], message)
+    main.g.sadd("queue", message)
 def event(

View File

@@ -1,5 +1,4 @@
 import asyncio
-import sys
 from os import getenv
 import uvloop

View File

@@ -101,18 +101,17 @@ hash_key = get_hash_key()
 @asyncio.coroutine
 async def spawn_processing_threads(data):
     len_data = len(data)
     # log.debug(f"Spawning processing threads for batch of {len_data} messages")
     loop = asyncio.get_event_loop()
     tasks = []
-    if len(data) < CPU_THREADS:
+    if len(data) < CPU_THREADS * 100:
         split_data = [data]
     else:
         msg_per_core = int(len(data) / CPU_THREADS)
         split_data = array_split(data, ceil(len(data) / msg_per_core))
     for index, split in enumerate(split_data):
-        # log.debug(f"Delegating processing of {len(split)} messages to thread {index}")
+        log.debug(f"Delegating processing of {len(split)} messages to thread {index}")
         task = loop.run_in_executor(p, process_data, split)
         tasks.append(task)
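
For reference, the splitting logic that the new `CPU_THREADS * 100` threshold feeds into, as a standalone sketch rather than part of the diff (assuming `array_split` comes from numpy and `CPU_THREADS` is configured elsewhere, as the surrounding code implies). Because the ingest loop now pops from one key, the incoming batch is capped at CHUNK_SIZE, which makes the per-thread chunk size predictable:

from math import ceil
from numpy import array_split

CPU_THREADS = 4  # illustrative value; the repository derives its own setting

def split_for_threads(data: list) -> list:
    # Small batches are cheaper to process on one worker than to fan out.
    if len(data) < CPU_THREADS * 100:
        return [data]
    # Otherwise aim for roughly len(data) / CPU_THREADS messages per worker.
    msg_per_core = int(len(data) / CPU_THREADS)
    return array_split(data, ceil(len(data) / msg_per_core))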

View File

@@ -7,12 +7,13 @@ import db
 import util
 from processing import process

-SOURCES = ["4ch", "irc", "dis"]
+# SOURCES = ["4ch", "irc", "dis"]
 # DEBUG CODE REMOVE ME
 # SOURCES.remove("4ch")
 # SOURCES.remove("dis")
 # DEBUG CODE REMOVE ME
-KEYPREFIX = "queue."
+# KEYPREFIX = "queue."
+KEYNAME = "queue"
 # Chunk size per source (divide by len(SOURCES) for total)
 CHUNK_SIZE = int(getenv("MONOLITH_INGEST_CHUNK_SIZE", "900"))
@@ -39,13 +40,13 @@ class Ingest(object):
     async def get_chunk(self):
         items = []
-        for source in SOURCES:
-            key = f"{KEYPREFIX}{source}"
-            chunk = await db.ar.spop(key, CHUNK_SIZE)
-            if not chunk:
-                continue
-            for item in chunk:
-                item = orjson.loads(item)
-                items.append(item)
+        # for source in SOURCES:
+        # key = f"{KEYPREFIX}{source}"
+        chunk = await db.ar.spop(KEYNAME, CHUNK_SIZE)
+        if not chunk:
+            return
+        for item in chunk:
+            item = orjson.loads(item)
+            items.append(item)
         if items:
             await process.spawn_processing_threads(items)
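
With a single key, an empty SPOP means the whole buffer is empty, which is why the loop can return where it previously continued to the next source, and the batch handed to spawn_processing_threads is bounded by CHUNK_SIZE (900 by default) rather than CHUNK_SIZE per source. A sketch of how such a consumer could be driven; the loop shape and delay are assumptions, not taken from this repository:

import asyncio

async def run_ingest(ingest, delay: float = 1.0) -> None:
    # Poll the shared set: each pass drains at most CHUNK_SIZE items and
    # hands them to the processing threads as one predictable batch.
    while True:
        await ingest.get_chunk()
        await asyncio.sleep(delay)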