monolith/db.py

109 lines
3.0 KiB
Python
Raw Normal View History

2022-09-04 20:40:04 +00:00
from math import ceil
2022-09-04 12:47:32 +00:00
import manticoresearch
2022-09-04 20:40:04 +00:00
import ujson
2022-09-04 12:47:32 +00:00
from manticoresearch.rest import ApiException
2022-09-04 20:40:04 +00:00
from numpy import array_split
from redis import StrictRedis
import util
2022-09-04 12:47:32 +00:00
from schemas.mc_s import schema
2022-09-04 20:29:00 +00:00
2022-09-04 12:47:32 +00:00
# Manticore client objects shared by this module: api_instance is used for
# bulk inserts and api_client is passed to create_index() further down.
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
api_client = manticoresearch.ApiClient(configuration)
api_instance = manticoresearch.IndexApi(api_client)
# NOTE(review): presumably a logger named "db" (util.get_logger is defined
# elsewhere); the functions visible in this file still report via print().
log = util.get_logger("db")
2022-09-04 12:47:32 +00:00
# Redis client on the local unix socket, database 0. Not referenced by the
# functions visible in this file.
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
2022-09-04 20:40:04 +00:00
def store_message(msg):
    """
    Store a single message into Manticore.

    The message is normalised in place before it is sent: keys whose value
    is None are removed, and integer values are converted to strings when
    the schema declares the field with a type starting with "string". The
    document is then inserted into the "main" index through the bulk API.
    An ApiException raised by the call is printed, not re-raised.

    :param msg: dict, the message to store (mutated in place)
    """
    # Debug output only: a missing or unsized "msg" field must not prevent
    # the message from being stored, so fall back to a length of 0.
    text = msg.get("msg")
    print("DISCORD MSGLEN", len(text) if hasattr(text, "__len__") else 0)

    # normalise fields
    # Iterate over a snapshot because keys are deleted while looping.
    for key, value in list(msg.items()):
        if value is None:
            del msg[key]
            continue
        if (
            isinstance(value, int)
            and key in schema
            and schema[key].startswith("string")
        ):
            msg[key] = str(value)

    # One JSON-encoded action per line, each terminated by a newline.
    body_post = ujson.dumps({"insert": {"index": "main", "doc": msg}}) + "\n"

    try:
        # Bulk index operations
        api_response = api_instance.bulk(body_post)  # , async_req=True
        print(api_response)
    except ApiException as e:
        print("Exception when calling IndexApi->bulk: %s\n" % e)
2022-09-04 20:40:04 +00:00
2022-09-04 20:29:00 +00:00
def store_message_bulk(data):
    """
    Store multiple messages into Manticore.

    The messages are split into batches of at most 10000 documents. Each
    message is normalised in place (None-valued keys removed, integers
    converted to strings when the schema declares the field with a type
    starting with "string") and every batch is inserted into the "main"
    index with one bulk API call. An ApiException raised for a batch is
    printed and processing continues with the next batch.

    :param data: list of dicts, the messages to store (each mutated in place)
    """
    # Check for "nothing to do" before calling len(), so that None is
    # treated like an empty list instead of raising a TypeError.
    if not data:
        print("BULK", 0)
        return
    print("BULK", len(data))

    # 10000: maximum inserts we can submit to
    # Manticore as of Sept 2022
    max_inserts = 10000
    split_posts = array_split(data, ceil(len(data) / max_inserts))
    for messages in split_posts:
        print("PROCESSING SPLIT OF", len(messages), "MESSAGES")
        for msg in messages:
            # normalise fields
            # Iterate over a snapshot because keys are deleted while looping.
            for key, value in list(msg.items()):
                if value is None:
                    del msg[key]
                    continue
                if (
                    isinstance(value, int)
                    and key in schema
                    and schema[key].startswith("string")
                ):
                    msg[key] = str(value)

        # One JSON-encoded action per line, each terminated by a newline.
        # Joined in a single pass instead of repeated string concatenation.
        body_post = "".join(
            ujson.dumps({"insert": {"index": "main", "doc": msg}}) + "\n"
            for msg in messages
        )

        try:
            # Bulk index operations
            api_response = api_instance.bulk(body_post)  # , async_req=True
            print(api_response)
        except ApiException as e:
            print("Exception when calling IndexApi->bulk: %s\n" % e)
        print("FINISHED PROCESSING SPLIT")
    print("BULK FINISH")
2022-09-04 12:47:32 +00:00
2022-09-04 20:40:04 +00:00
2022-09-04 12:47:32 +00:00
def update_schema():
    """
    Placeholder for schema updates; currently does nothing.
    """
    return None
2022-09-04 20:40:04 +00:00
2022-09-04 12:47:32 +00:00
def create_index(api_client):
    """
    Create the "main" table in Manticore if it does not already exist.

    Column definitions are generated from the schema mapping as
    "<name> <type>" pairs, and the table is created with the columnar
    engine. The generated statement is printed before it is executed.

    :param api_client: client object handed to manticoresearch.UtilsApi
    """
    utils_api = manticoresearch.UtilsApi(api_client)

    # Build the column list, e.g. "field_a type_a, field_b type_b".
    columns = []
    for name, column_type in schema.items():
        columns.append(f"{name} {column_type}")
    schema_types = ", ".join(columns)

    create_query = (
        f"create table if not exists main({schema_types}) engine='columnar'"
    )
    print("Schema types", create_query)
    utils_api.sql(create_query)
# Runs at import time: make sure the "main" table exists, then apply schema
# updates (update_schema is currently a no-op).
create_index(api_client)
update_schema()