monolith/db.py

from math import ceil

import manticoresearch
import ujson
from manticoresearch.rest import ApiException
from numpy import array_split
from redis import StrictRedis

import util
from schemas.mc_s import schema

# Manticore client objects, created once at import time and shared module-wide.
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
api_client = manticoresearch.ApiClient(configuration)
# Index API handle used by the store_* functions for bulk inserts.
api_instance = manticoresearch.IndexApi(api_client)

# Module logger; the functions visible in this file currently report via print().
log = util.get_logger("db")
# Redis connection over a local unix socket, database 0.
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)


def store_message(msg):
    """
    Store a single message into Manticore.

    The message is normalised in place before it is sent: keys whose value
    is None are removed, and int values are converted to str when the
    schema type for that key starts with "string".

    :param msg: dict of document fields; mutated in place by normalisation
    """
    # The text field may be missing or None (None values are only stripped
    # below), so the size report must not abort the store.
    print("DISCORD MSGLEN", len(msg.get("msg") or ""))
    # normalise fields
    for key, value in list(msg.items()):
        if value is None:
            del msg[key]
            # Nothing left to convert for a removed key
            continue
        if (
            key in schema
            and isinstance(value, int)
            and schema[key].startswith("string")
        ):
            msg[key] = str(value)

    # bulk() is given one JSON-encoded operation per line
    body_post = ujson.dumps({"insert": {"index": "main", "doc": msg}}) + "\n"

    try:
        # Bulk index operations
        # NOTE(review): with async_req=True the request is handed off and the
        # call returns immediately; request failures are presumably reported
        # through the returned handle rather than raised here, so this except
        # may never fire -- confirm against the client documentation.
        api_instance.bulk(body_post, async_req=True)
    except ApiException as e:
        print("Exception when calling IndexApi->bulk: %s\n" % e)


def store_message_bulk(data):
    """
    Store a batch of messages into Manticore.

    The batch is divided with array_split into ceil(len(data) / 10000)
    parts, and each part is sent as one bulk request. Every message is
    normalised in place first: keys whose value is None are removed, and
    int values are converted to str when the schema type for that key
    starts with "string".

    :param data: sequence of message dicts; each dict is mutated in place
    """
    print("BULK", len(data))
    if not data:
        return
    split_posts = array_split(data, ceil(len(data) / 10000))
    for messages in split_posts:
        print("PROCESSING SPLIT OF", len(messages), "MESSAGES")
        lines = []
        for msg in messages:
            # normalise fields
            for key, value in list(msg.items()):
                if value is None:
                    del msg[key]
                    # Nothing left to convert for a removed key
                    continue
                if (
                    key in schema
                    and isinstance(value, int)
                    and schema[key].startswith("string")
                ):
                    msg[key] = str(value)

            lines.append(ujson.dumps({"insert": {"index": "main", "doc": msg}}))

        # One JSON-encoded operation per line; join avoids repeated string
        # re-allocation across a large split.
        body_post = "".join(line + "\n" for line in lines)

        try:
            # Bulk index operations
            # NOTE(review): with async_req=True the request is handed off and
            # the call returns immediately; request failures are presumably
            # reported through the returned handle rather than raised here,
            # so this except may never fire -- confirm against the client
            # documentation.
            api_instance.bulk(body_post, async_req=True)
        except ApiException as e:
            print("Exception when calling IndexApi->bulk: %s\n" % e)
        print("FINISHED PROCESSING SPLIT")

    print("BULK FINISH")


def update_schema():
    """
    Placeholder for schema updates; currently does nothing.
    """
    return None


def create_index(api_client):
    """
    Create the "main" table from the schema mapping if it does not exist.

    :param api_client: Manticore API client used to run the SQL statement
    """
    sql_api = manticoresearch.UtilsApi(api_client)

    # Each schema entry becomes a "<name> <type>" column definition
    columns = []
    for column_name, column_type in schema.items():
        columns.append(f"{column_name} {column_type}")
    column_list = ", ".join(columns)

    create_query = (
        f"create table if not exists main({column_list}) engine='columnar'"
    )
    print("Schema types", create_query)
    sql_api.sql(create_query)


# Runs at import time: issues the create-table statement, then the
# (currently no-op) schema update.
create_index(api_client)
update_schema()
Reformat code 2022-09-04 20:40:04 +00:00			`from math import ceil`

Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`import manticoresearch`
Reformat code 2022-09-04 20:40:04 +00:00			`import ujson`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`from manticoresearch.rest import ApiException`
Reformat code 2022-09-04 20:40:04 +00:00			`from numpy import array_split`
Implement running Discord and 4chan gathering simultaneously 2022-09-02 21:30:45 +00:00			`from redis import StrictRedis`

			`import util`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`from schemas.mc_s import schema`
Run processing in thread 2022-09-04 20:29:00 +00:00
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")`
			`api_client = manticoresearch.ApiClient(configuration)`
			`api_instance = manticoresearch.IndexApi(api_client)`
Implement running Discord and 4chan gathering simultaneously 2022-09-02 21:30:45 +00:00
			`log = util.get_logger("db")`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)`
Implement running Discord and 4chan gathering simultaneously 2022-09-02 21:30:45 +00:00
Reformat code 2022-09-04 20:40:04 +00:00
Implement running Discord and 4chan gathering simultaneously 2022-09-02 21:30:45 +00:00			`def store_message(msg):`
			`"""`
			`Store a message into Manticore`
			`:param msg: dict`
			`"""`
Implement aiohttp 2022-09-04 18:44:25 +00:00			`print("DISCORD MSGLEN", len(msg["msg"]))`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`# normalise fields`
			`for key, value in list(msg.items()):`
			`if value is None:`
			`del msg[key]`
			`if key in schema:`
			`if isinstance(value, int):`
			`if schema[key].startswith("string"):`
			`msg[key] = str(value)`
Implement running Discord and 4chan gathering simultaneously 2022-09-02 21:30:45 +00:00
Reformat code 2022-09-04 20:40:04 +00:00			`body = [{"insert": {"index": "main", "doc": msg}}]`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`body_post = ""`
			`for item in body:`
Run processing in thread 2022-09-04 20:29:00 +00:00			`body_post += ujson.dumps(item)`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`body_post += "\n"`

Reformat code 2022-09-04 20:40:04 +00:00			`# print(body_post)`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`try:`
			`# Bulk index operations`
Reformat code 2022-09-04 20:40:04 +00:00			`api_instance.bulk(body_post, async_req=True)`
			`# print(api_response)`
Implement aiohttp 2022-09-04 18:44:25 +00:00			`except ApiException as e:`
			`print("Exception when calling IndexApi->bulk: %s\n" % e)`

Reformat code 2022-09-04 20:40:04 +00:00
Run processing in thread 2022-09-04 20:29:00 +00:00			`def store_message_bulk(data):`
Implement aiohttp 2022-09-04 18:44:25 +00:00			`"""`
			`Store a message into Manticore`
			`:param msg: dict`
			`"""`
Run processing in thread 2022-09-04 20:29:00 +00:00			`print("BULK", len(data))`
Split thread list into chunks to save memory 2022-09-05 06:20:30 +00:00			`if not data:`
			`return`
Run processing in thread 2022-09-04 20:29:00 +00:00			`split_posts = array_split(data, ceil(len(data) / 10000))`
			`for messages in split_posts:`
			`print("PROCESSING SPLIT OF", len(messages), "MESSAGES")`
			`total = []`
			`for msg in messages:`
			`# normalise fields`
			`for key, value in list(msg.items()):`
			`if value is None:`
			`del msg[key]`
			`if key in schema:`
			`if isinstance(value, int):`
			`if schema[key].startswith("string"):`
			`msg[key] = str(value)`
Implement aiohttp 2022-09-04 18:44:25 +00:00
Reformat code 2022-09-04 20:40:04 +00:00			`body = {"insert": {"index": "main", "doc": msg}}`
Run processing in thread 2022-09-04 20:29:00 +00:00			`total.append(body)`
Reformat code 2022-09-04 20:40:04 +00:00
Run processing in thread 2022-09-04 20:29:00 +00:00			`body_post = ""`
			`for item in total:`
			`body_post += ujson.dumps(item)`
			`body_post += "\n"`
Implement aiohttp 2022-09-04 18:44:25 +00:00
Reformat code 2022-09-04 20:40:04 +00:00			`# print(body_post)`
Run processing in thread 2022-09-04 20:29:00 +00:00			`try:`
			`# Bulk index operations`
Reformat code 2022-09-04 20:40:04 +00:00			`api_instance.bulk(body_post, async_req=True)`
			`# print(api_response)`
Run processing in thread 2022-09-04 20:29:00 +00:00			`except ApiException as e:`
			`print("Exception when calling IndexApi->bulk: %s\n" % e)`
			`print("FINISHED PROCESSING SPLIT")`

			`print("BULK FINISH")`
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00
Reformat code 2022-09-04 20:40:04 +00:00
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`def update_schema():`
			`pass`

Reformat code 2022-09-04 20:40:04 +00:00
Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`def create_index(api_client):`
			`util_instance = manticoresearch.UtilsApi(api_client)`
Reformat code 2022-09-04 20:40:04 +00:00			`schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])`

Begin implementing aiohttp 2022-09-04 12:47:32 +00:00			`create_query = f"create table if not exists main({schema_types}) engine='columnar'"`
			`print("Schema types", create_query)`
			`util_instance.sql(create_query)`


			`create_index(api_client)`
			`update_schema()`