# monolith/db.py

from math import ceil

import aioredis
import manticoresearch
import ujson
from manticoresearch.rest import ApiException
from numpy import array_split
from redis import StrictRedis

import util
from schemas import mc_s

# Manticore Search client, pointed at the HTTP endpoint on host monolith-db-1
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
api_client = manticoresearch.ApiClient(configuration)
# Index API handle; its bulk() call is what the store_message* functions use
api_instance = manticoresearch.IndexApi(api_client)

# Module logger, registered under the name "db"
log = util.get_logger("db")

# Redis (legacy)
# redis-py client connected over the local unix socket, database 0
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)

# AIORedis
# aioredis client created for the same unix socket and database
ar = aioredis.from_url("unix:///var/run/redis/redis.sock", db=0)

# Message types that the store_message* functions route to the "main" index
# (validated against mc_s.schema_main)
TYPES_MAIN = [
    "msg",
    "notice",
    "action",
    "part",
    "join",
    "kick",
    "quit",
    "nick",
    "mode",
    "topic",
]
# Message types routed to the "meta" index (mc_s.schema_meta)
TYPES_META = ["who"]
# Message types routed to the "internal" index (mc_s.schema_int)
TYPES_INT = ["conn", "highlight", "znc", "query", "self"]


def store_message(msg):
    """
    Store a single message into Manticore.

    The target index ("main", "meta" or "internal") and its schema are
    picked from ``msg["type"]``. Before sending, the message is normalised
    IN PLACE: keys whose value is None are removed, and int values are
    converted to str where the schema declares the column as a string type.

    A message whose type is not listed in TYPES_MAIN, TYPES_META or
    TYPES_INT is reported and not stored. API errors are reported together
    with the attempted payload and are not re-raised.

    :param msg: dict, must contain a "type" key; modified in place
    :raises KeyError: if msg has no "type" key
    """
    # Duplicated to avoid extra function call
    msg_type = msg["type"]
    if msg_type in TYPES_MAIN:
        index = "main"
        schema = mc_s.schema_main
    elif msg_type in TYPES_META:
        index = "meta"
        schema = mc_s.schema_meta
    elif msg_type in TYPES_INT:
        index = "internal"
        schema = mc_s.schema_int
    else:
        # Without this branch index/schema stay unbound and the function
        # dies with an UnboundLocalError a few lines further down.
        print("Unknown message type, not storing:", msg_type)
        return

    # normalise fields
    # Iterate over a snapshot because keys are deleted while looping.
    for key, value in list(msg.items()):
        if value is None:
            del msg[key]
            continue
        # Manticore rejects ints in string columns, so cast them up front.
        if isinstance(value, int) and key in schema:
            if schema[key].startswith("string"):
                msg[key] = str(value)

    # The bulk endpoint takes newline-delimited JSON, one operation per line.
    body_post = ujson.dumps({"insert": {"index": index, "doc": msg}}) + "\n"

    try:
        # Bulk index operations
        api_instance.bulk(body_post)  # , async_req=True
    except ApiException as e:
        print("Exception when calling IndexApi->bulk: %s\n" % e)
        print("ATTEMPT", body_post)


def store_message_bulk(data):
    """
    Store a batch of messages into Manticore.

    The batch is split into chunks so that no single request carries more
    than 10000 inserts. Each message is routed to the "main", "meta" or
    "internal" index by its "type" and normalised IN PLACE: keys whose
    value is None are removed, and int values are converted to str where
    the schema declares the column as a string type.

    Messages whose type is not listed in TYPES_MAIN, TYPES_META or
    TYPES_INT are reported and left out of the request. API errors are
    reported together with the attempted payload and are not re-raised;
    remaining chunks are still submitted.

    :param data: sequence of message dicts, each with a "type" key;
                 the dicts are modified in place
    :raises KeyError: if a message has no "type" key
    """
    if not data:
        return
    # 10000: maximum inserts we can submit to
    # Manticore as of Sept 2022
    split_posts = array_split(data, ceil(len(data) / 10000))
    for messages in split_posts:
        total = []
        for msg in messages:
            # Duplicated to avoid extra function call (see store_message)
            msg_type = msg["type"]
            if msg_type in TYPES_MAIN:
                index = "main"
                schema = mc_s.schema_main
            elif msg_type in TYPES_META:
                index = "meta"
                schema = mc_s.schema_meta
            elif msg_type in TYPES_INT:
                index = "internal"
                schema = mc_s.schema_int
            else:
                # index/schema would otherwise carry over from the previous
                # message and this one would land in the wrong index (or
                # raise UnboundLocalError if it is the first message).
                print("Unknown message type, not storing:", msg_type)
                continue

            # normalise fields
            # Iterate over a snapshot because keys are deleted while looping.
            for key, value in list(msg.items()):
                if value is None:
                    del msg[key]
                    continue
                # Ints are not accepted in string columns, so cast them.
                if isinstance(value, int) and key in schema:
                    if schema[key].startswith("string"):
                        msg[key] = str(value)

            total.append({"insert": {"index": index, "doc": msg}})

        if not total:
            # Every message in this chunk was skipped; nothing to submit.
            continue

        # The bulk endpoint takes newline-delimited JSON, one operation
        # per line, each line terminated by a newline.
        body_post = "".join(ujson.dumps(item) + "\n" for item in total)

        try:
            # Bulk index operations
            api_response = api_instance.bulk(body_post)  # , async_req=True
            print(api_response)
        except ApiException as e:
            print("Exception when calling IndexApi->bulk: %s\n" % e)
            print("ATTEMPT", body_post)


def update_schema():
    """
    No-op stub: performs no work and returns None.
    """
    return None


def create_index(api_client):
    """
    Create the "main", "meta" and "internal" tables if they are missing.

    For every table a ``create table if not exists`` statement is built
    from the matching schema mapping in mc_s (column name -> column type),
    printed, and executed through the Manticore utils API.

    :param api_client: manticoresearch.ApiClient used to run the statements
    """
    sql_api = manticoresearch.UtilsApi(api_client)
    tables = (
        ("main", mc_s.schema_main),
        ("meta", mc_s.schema_meta),
        ("internal", mc_s.schema_int),
    )
    for table, columns in tables:
        # One "<column> <type>" fragment per schema entry
        column_defs = []
        for column, column_type in columns.items():
            column_defs.append(f"{column} {column_type}")
        schema_types = ", ".join(column_defs)

        statement = (
            f"create table if not exists {table}({schema_types}) engine='columnar'"
        )
        print("Schema types", statement)
        sql_api.sql(statement)


# Executed at import time: issue "create table if not exists" for every
# index so the tables are present before any message is stored.
create_index(api_client)
# Currently a no-op (update_schema has an empty body).
update_schema()
Reformat code 2 years ago			`from math import ceil`

Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`import aioredis`
Begin implementing aiohttp 2 years ago			`import manticoresearch`
Reformat code 2 years ago			`import ujson`
Begin implementing aiohttp 2 years ago			`from manticoresearch.rest import ApiException`
Reformat code 2 years ago			`from numpy import array_split`
Implement running Discord and 4chan gathering simultaneously 2 years ago			`from redis import StrictRedis`

			`import util`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`from schemas import mc_s`

Begin implementing aiohttp 2 years ago			`configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")`
			`api_client = manticoresearch.ApiClient(configuration)`
			`api_instance = manticoresearch.IndexApi(api_client)`
Implement running Discord and 4chan gathering simultaneously 2 years ago
			`log = util.get_logger("db")`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago
			`# Redis (legacy)`
Begin implementing aiohttp 2 years ago			`r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago
			`# AIORedis`
			`ar = aioredis.from_url("unix:///var/run/redis/redis.sock", db=0)`

			`TYPES_MAIN = [`
			`"msg",`
			`"notice",`
			`"action",`
			`"part",`
			`"join",`
			`"kick",`
			`"quit",`
			`"nick",`
			`"mode",`
			`"topic",`
			`]`
			`TYPES_META = ["who"]`
			`TYPES_INT = ["conn", "highlight", "znc", "query", "self"]`

Reformat code 2 years ago
Implement running Discord and 4chan gathering simultaneously 2 years ago			`def store_message(msg):`
			`"""`
			`Store a message into Manticore`
			`:param msg: dict`
			`"""`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`# Duplicated to avoid extra function call`
			`if msg["type"] in TYPES_MAIN:`
			`index = "main"`
			`schema = mc_s.schema_main`
			`elif msg["type"] in TYPES_META:`
			`index = "meta"`
			`schema = mc_s.schema_meta`
			`elif msg["type"] in TYPES_INT:`
			`index = "internal"`
			`schema = mc_s.schema_int`
Begin implementing aiohttp 2 years ago			`# normalise fields`
			`for key, value in list(msg.items()):`
			`if value is None:`
			`del msg[key]`
			`if key in schema:`
			`if isinstance(value, int):`
			`if schema[key].startswith("string"):`
			`msg[key] = str(value)`
Implement running Discord and 4chan gathering simultaneously 2 years ago
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`body = [{"insert": {"index": index, "doc": msg}}]`
Begin implementing aiohttp 2 years ago			`body_post = ""`
			`for item in body:`
Run processing in thread 2 years ago			`body_post += ujson.dumps(item)`
Begin implementing aiohttp 2 years ago			`body_post += "\n"`

Reformat code 2 years ago			`# print(body_post)`
Begin implementing aiohttp 2 years ago			`try:`
			`# Bulk index operations`
Reformat and set the net and channel for 4chan 2 years ago			`api_response = api_instance.bulk(body_post) # , async_req=True`
Implement ingesting to Redis from Threshold 2 years ago			`# print(api_response)`
Implement aiohttp 2 years ago			`except ApiException as e:`
			`print("Exception when calling IndexApi->bulk: %s\n" % e)`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`print("ATTEMPT", body_post)`
Implement aiohttp 2 years ago
Reformat code 2 years ago
Run processing in thread 2 years ago			`def store_message_bulk(data):`
Implement aiohttp 2 years ago			`"""`
			`Store a message into Manticore`
			`:param msg: dict`
			`"""`
Split thread list into chunks to save memory 2 years ago			`if not data:`
			`return`
Make crawler more efficient and implement configurable parameters 2 years ago			`# 10000: maximum inserts we can submit to`
			`# Manticore as of Sept 2022`
Run processing in thread 2 years ago			`split_posts = array_split(data, ceil(len(data) / 10000))`
			`for messages in split_posts:`
			`total = []`
			`for msg in messages:`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`# Duplicated to avoid extra function call (see above)`
			`if msg["type"] in TYPES_MAIN:`
			`index = "main"`
			`schema = mc_s.schema_main`
			`elif msg["type"] in TYPES_META:`
			`index = "meta"`
			`schema = mc_s.schema_meta`
			`elif msg["type"] in TYPES_INT:`
			`index = "internal"`
			`schema = mc_s.schema_int`
Run processing in thread 2 years ago			`# normalise fields`
			`for key, value in list(msg.items()):`
			`if value is None:`
			`del msg[key]`
			`if key in schema:`
			`if isinstance(value, int):`
			`if schema[key].startswith("string"):`
			`msg[key] = str(value)`
Implement aiohttp 2 years ago
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`body = {"insert": {"index": index, "doc": msg}}`
Run processing in thread 2 years ago			`total.append(body)`
Reformat code 2 years ago
Run processing in thread 2 years ago			`body_post = ""`
			`for item in total:`
			`body_post += ujson.dumps(item)`
			`body_post += "\n"`
Implement aiohttp 2 years ago
Reformat code 2 years ago			`# print(body_post)`
Run processing in thread 2 years ago			`try:`
			`# Bulk index operations`
Reformat and set the net and channel for 4chan 2 years ago			`api_response = api_instance.bulk(body_post) # , async_req=True`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`print(api_response)`
Run processing in thread 2 years ago			`except ApiException as e:`
			`print("Exception when calling IndexApi->bulk: %s\n" % e)`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`print("ATTEMPT", body_post)`
Begin implementing aiohttp 2 years ago
Reformat code 2 years ago
Begin implementing aiohttp 2 years ago			`def update_schema():`
			`pass`

Reformat code 2 years ago
Begin implementing aiohttp 2 years ago			`def create_index(api_client):`
			`util_instance = manticoresearch.UtilsApi(api_client)`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`schemas = {`
			`"main": mc_s.schema_main,`
			`"meta": mc_s.schema_meta,`
			`"internal": mc_s.schema_int,`
			`}`
			`for name, schema in schemas.items():`
			`schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])`

			`create_query = (`
			`f"create table if not exists {name}({schema_types}) engine='columnar'"`
			`)`
			`print("Schema types", create_query)`
			`util_instance.sql(create_query)`
Begin implementing aiohttp 2 years ago

			`create_index(api_client)`
			`update_schema()`