Fully implement Elasticsearch indexing
This commit is contained in:
parent 052631c71f
commit 49f46c33ba
db.py

@@ -1,20 +1,17 @@
-import random
 from os import getenv
 
 import aioredis
 import orjson
 import redis
 
-# Kafka
-from aiokafka import AIOKafkaProducer
+# Elasticsearch
+from elasticsearch import AsyncElasticsearch
 
 import util
 
 trues = ("true", "1", "t", True)
 
-MONOLITH_KAFKA_ENABLED = getenv("MONOLITH_KAFKA_ENABLED", "false").lower() in trues
+# INDEX = "msg"
 
-# KAFKA_TOPIC = "msg"
 
 log = util.get_logger("db")
@@ -47,15 +44,54 @@ TYPES_META = ["who"]
 TYPES_INT = ["conn", "highlight", "znc", "query", "self"]
 KEYNAME = "queue"
 
+ELASTICSEARCH_USERNAME = getenv("ELASTICSEARCH_USERNAME", "elastic")
+ELASTICSEARCH_PASSWORD = getenv("ELASTICSEARCH_PASSWORD", "changeme")
+ELASTICSEARCH_HOST = getenv("ELASTICSEARCH_HOST", "localhost")
+ELASTICSEARCH_PORT = int(getenv("ELASTICSEARCH_PORT", "9200"))
+ELASTICSEARCH_TLS = getenv("ELASTICSEARCH_TLS", "false") in trues
 
-async def store_kafka_batch(data):
-    if not MONOLITH_KAFKA_ENABLED:
-        log.info(f"Not storing Kafka batch of length {len(data)}, Kafka is disabled.")
-        return
-    # log.debug(f"Storing Kafka batch of {len(data)} messages")
-    producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
-    await producer.start()
-    topicmap = {}
+client = None
+
+# These are sometimes numeric, sometimes strings.
+# If they are seen to be numeric first, ES will erroneously
+# index them as "long" and then subsequently fail to index messages
+# with strings in the field.
+keyword_fields = ["nick_id", "user_id", "net_id"]
+
+mapping = {
+    "mappings": {
+        "properties": {
+            "ts": {"type": "date", "format": "epoch_second"},
+            "file_tim": {"type": "date", "format": "epoch_millis"},
+        }
+    }
+}
+for field in keyword_fields:
+    mapping["mappings"]["properties"][field] = {"type": "text"}
+
+
+async def initialise_elasticsearch():
+    """
+    Initialise the Elasticsearch client.
+    """
+    auth = (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD)
+    client = AsyncElasticsearch(ELASTICSEARCH_HOST, http_auth=auth, verify_certs=False)
+    for index in ("main", "restricted"):
+        if await client.indices.exists(index=index):
+            # update index with mapping
+            await client.indices.put_mapping(
+                index=index, properties=mapping["mappings"]["properties"]
+            )
+        else:
+            await client.indices.create(index=index, mappings=mapping["mappings"])
+    return client
+
+
+async def store_batch(data):
+    global client
+    if not client:
+        client = await initialise_elasticsearch()
+    indexmap = {}
     for msg in data:
         if msg["type"] in TYPES_MAIN:
             # index = "main"
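The comment block in this hunk explains why nick_id, user_id, and net_id get an explicit mapping: under dynamic mapping, the first document seen fixes a field's type. A minimal repro of that failure mode, assuming a local unauthenticated cluster; the host and the "mapping-demo" index name are illustrative, not from the commit:

import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    client = AsyncElasticsearch("http://localhost:9200")  # illustrative host
    # First document: ES dynamically maps nick_id as "long".
    await client.index(index="mapping-demo", body={"nick_id": 123})
    try:
        # Second document: a string in a "long" field is rejected
        # with a mapper_parsing_exception (HTTP 400).
        await client.index(index="mapping-demo", body={"nick_id": "abc"})
    except Exception as exc:
        print(f"Indexing failed: {exc}")
    await client.close()


asyncio.run(main())

Two details worth noting in the committed code: the fields are mapped as "text" even though the variable is named keyword_fields, and ELASTICSEARCH_PORT and ELASTICSEARCH_TLS are read from the environment but not yet used, since the client is constructed from ELASTICSEARCH_HOST alone.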
@@ -68,7 +104,7 @@ async def store_kafka_batch(data):
             index = "internal"
             # schema = mc_s.schema_int
 
-            KAFKA_TOPIC = index
+            INDEX = index
 
         # if key in schema:
         #     if isinstance(value, int):
@@ -76,45 +112,20 @@ async def store_kafka_batch(data):
         #             "text"
         #         ):
         #             msg[key] = str(value)
-        body = orjson.dumps(msg)
+        # body = orjson.dumps(msg)
         if "ts" not in msg:
             raise Exception("No TS in msg")
-        if KAFKA_TOPIC not in topicmap:
-            topicmap[KAFKA_TOPIC] = [body]
+        if INDEX not in indexmap:
+            indexmap[INDEX] = [msg]
         else:
-            topicmap[KAFKA_TOPIC].append(body)
+            indexmap[INDEX].append(msg)
 
-    for topic, messages in topicmap.items():
-        batch = producer.create_batch()
-        for body in messages:
-            metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
-            if metadata is None:
-                partitions = await producer.partitions_for(topic)
-                partition = random.choice(tuple(partitions))
-                await producer.send_batch(batch, topic, partition=partition)
-                # log.debug(
-                #     (
-                #         f"{batch.record_count()} messages sent to topic "
-                #         f"{topic} partition {partition}"
-                #     )
-                # )
-                batch = producer.create_batch()
-                continue
-
-        partitions = await producer.partitions_for(topic)
-        partition = random.choice(tuple(partitions))
-        await producer.send_batch(batch, topic, partition=partition)
-        # log.debug(
-        #     (
-        #         f"{batch.record_count()} messages sent to topic "
-        #         f"{topic} partition {partition}"
-        #     )
-        # )
-    log.debug(
-        "Kafka batches sent: "
-        + ", ".join([tpc + ": " + str(len(topicmap[tpc])) for tpc in topicmap])
-    )
-    await producer.stop()
+    for index, index_messages in indexmap.items():
+        for message in index_messages:
+            result = await client.index(index=index, body=message)
+            if not result["result"] == "created":
+                log.error(f"Indexing failed: {result}")
+    log.debug(f"Indexed {len(data)} messages in ES")
 
 
 async def queue_message(msg):
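The new store_batch() issues one client.index() call per message, i.e. one HTTP round-trip each. A sketch of an alternative using the bulk helper that ships with elasticsearch[async]; store_batch_bulk and its arguments are hypothetical, not part of the commit:

from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_bulk


async def store_batch_bulk(client: AsyncElasticsearch, indexmap: dict):
    # Flatten {index: [msg, ...]} into bulk actions; "_index" routes each
    # document and "_source" carries the message body.
    actions = (
        {"_index": index, "_source": msg}
        for index, messages in indexmap.items()
        for msg in messages
    )
    # One bulk request per batch instead of one request per message.
    ok, errors = await async_bulk(client, actions, raise_on_error=False)
    if errors:
        print(f"{len(errors)} messages failed to index: {errors[:3]}")
    return ok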
docker-compose file (filename not shown in this view)

@@ -9,6 +9,10 @@ services:
       - ${PORTAINER_GIT_DIR}:/code
     env_file:
       - ../stack.env
+    networks:
+      - default
+      - pathogen
+      - elastic
 
   threshold:
     image: pathogen/threshold:latest
@@ -30,6 +34,8 @@ services:
     # for development
     extra_hosts:
       - "host.docker.internal:host-gateway"
+    networks:
+      - default
 
   ssdb:
     image: tsl0922/ssdb
@@ -38,6 +44,8 @@ services:
       - "1289:1289"
     environment:
      - SSDB_PORT=1289
+    networks:
+      - default
 
   tmp:
     image: busybox
@@ -67,9 +75,12 @@ services:
       retries: 15
 
 networks:
   default:
-    external:
-      name: pathogen
+    driver: bridge
+  pathogen:
+    external: true
+  elastic:
+    external: true
 
 volumes:
   redis_data:
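The compose change attaches the app container to the pre-existing external elastic network alongside pathogen. A quick connectivity check that could be run from inside the container once it joins that network; the elasticsearch service hostname and the credentials are assumptions that depend on how the cluster on that network is deployed:

import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    client = AsyncElasticsearch(
        "http://elasticsearch:9200",  # assumed service name on the elastic network
        http_auth=("elastic", "changeme"),
        verify_certs=False,
    )
    info = await client.info()  # raises if the host is unreachable
    print(info["version"]["number"])
    await client.close()


asyncio.run(main())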
processing module (filename not shown in this view)

@@ -135,7 +135,7 @@ async def spawn_processing_threads(chunk, length):
             f"{cores} threads: {len(flat_list)}"
         )
     )
-    await db.store_kafka_batch(flat_list)
+    await db.store_batch(flat_list)
 
     # log.debug(f"Finished processing {len_data} messages")
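At the call site only the function name changes: store_batch() lazily initialises the shared client on first use, so spawn_processing_threads() needs no other changes and no Kafka feature flag. A hypothetical smoke test for the new path; "msg" is assumed to be in TYPES_MAIN, and "ts" must be epoch seconds or store_batch() raises "No TS in msg":

import asyncio
import time

import db  # the module patched above


async def main():
    message = {
        "type": "msg",          # assumed to be a TYPES_MAIN type
        "ts": int(time.time()),  # required: epoch seconds, per the mapping
        "msg": "hello world",
        "nick": "alice",
        "channel": "#test",
    }
    await db.store_batch([message])


asyncio.run(main())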
requirements file (filename not shown in this view)

@@ -8,7 +8,7 @@ python-dotenv
 #manticoresearch
 numpy
 aioredis[hiredis]
-aiokafka
+#aiokafka
 vaderSentiment
 polyglot
 pyicu
@@ -22,3 +22,4 @@ python-Levenshtein
 orjson
 uvloop
 numba
+elasticsearch[async]