Fully implement Elasticsearch indexing

This commit is contained in:
Mark Veidemanis 2022-11-22 20:15:02 +00:00
parent 052631c71f
commit 49f46c33ba
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
4 changed files with 78 additions and 55 deletions

db.py
View File

@@ -1,20 +1,17 @@
-import random
 from os import getenv

 import aioredis
 import orjson
 import redis

-# Kafka
-from aiokafka import AIOKafkaProducer
+# Elasticsearch
+from elasticsearch import AsyncElasticsearch

 import util

 trues = ("true", "1", "t", True)

-MONOLITH_KAFKA_ENABLED = getenv("MONOLITH_KAFKA_ENABLED", "false").lower() in trues
-
-# KAFKA_TOPIC = "msg"
+# INDEX = "msg"

 log = util.get_logger("db")
@@ -47,15 +44,54 @@ TYPES_META = ["who"]
 TYPES_INT = ["conn", "highlight", "znc", "query", "self"]
 KEYNAME = "queue"

-async def store_kafka_batch(data):
-    if not MONOLITH_KAFKA_ENABLED:
-        log.info(f"Not storing Kafka batch of length {len(data)}, Kafka is disabled.")
-        return
-    # log.debug(f"Storing Kafka batch of {len(data)} messages")
-    producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
-    await producer.start()
-    topicmap = {}
+ELASTICSEARCH_USERNAME = getenv("ELASTICSEARCH_USERNAME", "elastic")
+ELASTICSEARCH_PASSWORD = getenv("ELASTICSEARCH_PASSWORD", "changeme")
+ELASTICSEARCH_HOST = getenv("ELASTICSEARCH_HOST", "localhost")
+ELASTICSEARCH_PORT = int(getenv("ELASTICSEARCH_PORT", "9200"))
+ELASTICSEARCH_TLS = getenv("ELASTICSEARCH_TLS", "false") in trues
+
+client = None
+
+# These are sometimes numeric, sometimes strings.
+# If they are seen to be numeric first, ES will erroneously
+# index them as "long" and then subsequently fail to index messages
+# with strings in the field.
+keyword_fields = ["nick_id", "user_id", "net_id"]
+
+mapping = {
+    "mappings": {
+        "properties": {
+            "ts": {"type": "date", "format": "epoch_second"},
+            "file_tim": {"type": "date", "format": "epoch_millis"},
+        }
+    }
+}
+for field in keyword_fields:
+    mapping["mappings"]["properties"][field] = {"type": "text"}
+
+
+async def initialise_elasticsearch():
+    """
+    Initialise the Elasticsearch client.
+    """
+    auth = (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD)
+    client = AsyncElasticsearch(ELASTICSEARCH_HOST, http_auth=auth, verify_certs=False)
+    for index in ("main", "restricted"):
+        if await client.indices.exists(index=index):
+            # update index with mapping
+            await client.indices.put_mapping(
+                index=index, properties=mapping["mappings"]["properties"]
+            )
+        else:
+            await client.indices.create(index=index, mappings=mapping["mappings"])
+    return client
+
+
+async def store_batch(data):
+    global client
+    if not client:
+        client = await initialise_elasticsearch()
+
+    indexmap = {}
     for msg in data:
         if msg["type"] in TYPES_MAIN:
             # index = "main"
@@ -68,7 +104,7 @@ async def store_kafka_batch(data):
             index = "internal"
             # schema = mc_s.schema_int
-        KAFKA_TOPIC = index
+        INDEX = index

         # if key in schema:
         #     if isinstance(value, int):
@@ -76,45 +112,20 @@ async def store_kafka_batch(data):
         # "text"
         # ):
         # msg[key] = str(value)
-        body = orjson.dumps(msg)
+        # body = orjson.dumps(msg)
         if "ts" not in msg:
             raise Exception("No TS in msg")
-        if KAFKA_TOPIC not in topicmap:
-            topicmap[KAFKA_TOPIC] = [body]
+        if INDEX not in indexmap:
+            indexmap[INDEX] = [msg]
         else:
-            topicmap[KAFKA_TOPIC].append(body)
-    for topic, messages in topicmap.items():
-        batch = producer.create_batch()
-        for body in messages:
-            metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
-            if metadata is None:
-                partitions = await producer.partitions_for(topic)
-                partition = random.choice(tuple(partitions))
-                await producer.send_batch(batch, topic, partition=partition)
-                # log.debug(
-                #     (
-                #         f"{batch.record_count()} messages sent to topic "
-                #         f"{topic} partition {partition}"
-                #     )
-                # )
-                batch = producer.create_batch()
-                continue
-        partitions = await producer.partitions_for(topic)
-        partition = random.choice(tuple(partitions))
-        await producer.send_batch(batch, topic, partition=partition)
-        # log.debug(
-        #     (
-        #         f"{batch.record_count()} messages sent to topic "
-        #         f"{topic} partition {partition}"
-        #     )
-        # )
-    log.debug(
-        "Kafka batches sent: "
-        + ", ".join([tpc + ": " + str(len(topicmap[tpc])) for tpc in topicmap])
-    )
-    await producer.stop()
+            indexmap[INDEX].append(msg)
+
+    for index, index_messages in indexmap.items():
+        for message in index_messages:
+            result = await client.index(index=index, body=message)
+            if not result["result"] == "created":
+                log.error(f"Indexing failed: {result}")
+    log.debug(f"Indexed {len(data)} messages in ES")


 async def queue_message(msg):
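For reference, a minimal, hypothetical smoke test of the db.py changes above. The message shape (a dict with a "type" field and an integer "ts", matching the epoch_second mapping) comes from the diff; the sample type "msg", the field values, and the asyncio entry point are illustrative assumptions and not part of this commit:

# Hypothetical usage sketch: index one message through the new store_batch()
# path, which lazily calls initialise_elasticsearch() on first use.
import asyncio
import time

import db


async def main():
    batch = [
        {
            "type": "msg",           # assumed to be a type routed to the "main" index
            "ts": int(time.time()),  # required; mapped as date/epoch_second
            "msg": "hello world",
            "nick_id": "abc123",     # mapped as text to avoid a dynamic "long" mapping
        }
    ]
    await db.store_batch(batch)


if __name__ == "__main__":
    asyncio.run(main())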

View File

@@ -9,6 +9,10 @@ services:
       - ${PORTAINER_GIT_DIR}:/code
     env_file:
       - ../stack.env
+    networks:
+      - default
+      - pathogen
+      - elastic

   threshold:
     image: pathogen/threshold:latest
@@ -30,6 +34,8 @@ services:
     # for development
     extra_hosts:
       - "host.docker.internal:host-gateway"
+    networks:
+      - default

   ssdb:
     image: tsl0922/ssdb
@@ -38,6 +44,8 @@ services:
       - "1289:1289"
     environment:
       - SSDB_PORT=1289
+    networks:
+      - default

   tmp:
     image: busybox
@@ -68,8 +76,11 @@ services:

 networks:
   default:
-    external:
-      name: pathogen
+    driver: bridge
+  pathogen:
+    external: true
+  elastic:
+    external: true

 volumes:
   redis_data:
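The compose changes above attach the application containers to the external "elastic" network so they can reach Elasticsearch directly. A minimal connectivity check, assuming it runs inside one of those containers with the same ELASTICSEARCH_* environment variables that db.py reads (defaults mirror db.py; the actual host name on the elastic network depends on how the Elasticsearch stack is deployed and is not part of this commit):

# Hypothetical connectivity check for the external "elastic" network.
import asyncio
from os import getenv

from elasticsearch import AsyncElasticsearch


async def main():
    auth = (
        getenv("ELASTICSEARCH_USERNAME", "elastic"),
        getenv("ELASTICSEARCH_PASSWORD", "changeme"),
    )
    client = AsyncElasticsearch(
        getenv("ELASTICSEARCH_HOST", "localhost"), http_auth=auth, verify_certs=False
    )
    # ping() returns True when the node is reachable with these credentials
    print(await client.ping())
    await client.close()


asyncio.run(main())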

View File

@@ -135,7 +135,7 @@ async def spawn_processing_threads(chunk, length):
             f"{cores} threads: {len(flat_list)}"
         )
     )
-    await db.store_kafka_batch(flat_list)
+    await db.store_batch(flat_list)

     # log.debug(f"Finished processing {len_data} messages")

View File

@@ -8,7 +8,7 @@ python-dotenv
 #manticoresearch
 numpy
 aioredis[hiredis]
-aiokafka
+#aiokafka
 vaderSentiment
 polyglot
 pyicu
@@ -22,3 +22,4 @@ python-Levenshtein
 orjson
 uvloop
 numba
+elasticsearch[async]