Don't muddle up the topics when sending Kafka batches
This commit is contained in:
parent
e0803d4934
commit
027c43b60a
46
db.py
46
db.py
|
@ -47,7 +47,7 @@ async def store_kafka_batch(data):
|
|||
log.debug(f"Storing Kafka batch of {len(data)} messages")
|
||||
producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
|
||||
await producer.start()
|
||||
batch = producer.create_batch()
|
||||
topicmap = {}
|
||||
for msg in data:
|
||||
if msg["type"] in TYPES_MAIN:
|
||||
# index = "main"
|
||||
|
@ -72,23 +72,39 @@ async def store_kafka_batch(data):
|
|||
# ):
|
||||
# msg[key] = str(value)
|
||||
body = orjson.dumps(msg)
|
||||
# orjson returns bytes
|
||||
# body = str.encode(message)
|
||||
if "ts" not in msg:
|
||||
raise Exception("No TS in msg")
|
||||
metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
|
||||
if metadata is None:
|
||||
partitions = await producer.partitions_for(KAFKA_TOPIC)
|
||||
partition = random.choice(tuple(partitions))
|
||||
await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
|
||||
log.debug(f"{batch.record_count()} messages sent to partition {partition}")
|
||||
batch = producer.create_batch()
|
||||
continue
|
||||
if KAFKA_TOPIC not in topicmap:
|
||||
topicmap[KAFKA_TOPIC] = [body]
|
||||
else:
|
||||
topicmap[KAFKA_TOPIC].append(body)
|
||||
|
||||
partitions = await producer.partitions_for(KAFKA_TOPIC)
|
||||
partition = random.choice(tuple(partitions))
|
||||
await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
|
||||
log.debug(f"{batch.record_count()} messages sent to partition {partition}")
|
||||
for topic, messages in topicmap.items():
|
||||
batch = producer.create_batch()
|
||||
for body in messages:
|
||||
metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
|
||||
if metadata is None:
|
||||
partitions = await producer.partitions_for(topic)
|
||||
partition = random.choice(tuple(partitions))
|
||||
await producer.send_batch(batch, topic, partition=partition)
|
||||
log.debug(
|
||||
(
|
||||
f"{batch.record_count()} messages sent to topic "
|
||||
f"{topic} partition {partition}"
|
||||
)
|
||||
)
|
||||
batch = producer.create_batch()
|
||||
continue
|
||||
|
||||
partitions = await producer.partitions_for(topic)
|
||||
partition = random.choice(tuple(partitions))
|
||||
await producer.send_batch(batch, topic, partition=partition)
|
||||
log.debug(
|
||||
(
|
||||
f"{batch.record_count()} messages sent to topic "
|
||||
f"{topic} partition {partition}"
|
||||
)
|
||||
)
|
||||
await producer.stop()
|
||||
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ import asyncio
|
|||
import random
|
||||
import string
|
||||
from math import ceil
|
||||
from os import getenv
|
||||
|
||||
import aiohttp
|
||||
from numpy import array_split
|
||||
|
@ -10,8 +11,6 @@ from numpy import array_split
|
|||
import db
|
||||
import util
|
||||
|
||||
from os import getenv
|
||||
|
||||
# CONFIGURATION #
|
||||
|
||||
# Number of 4chan threads to request at once
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import asyncio
|
||||
from os import getenv
|
||||
|
||||
import orjson
|
||||
|
||||
import db
|
||||
import util
|
||||
from processing import process
|
||||
from os import getenv
|
||||
|
||||
SOURCES = ["4ch", "irc", "dis"]
|
||||
KEYPREFIX = "queue."
|
||||
|
|
Loading…
Reference in New Issue