# monolith/db.py

import random
from math import ceil

import aioredis
import manticoresearch
import ujson
from aiokafka import AIOKafkaProducer
from manticoresearch.rest import ApiException
from numpy import array_split
from redis import StrictRedis

import util

# Manticore schema
from schemas import mc_s

# Manticore
# REST client objects for the Manticore search daemon. In the visible part of
# this file they are only referenced from commented-out code.
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
api_client = manticoresearch.ApiClient(configuration)
api_instance = manticoresearch.IndexApi(api_client)

# Kafka
# NOTE(review): this repeats the AIOKafkaProducer import at the top of the file.
from aiokafka import AIOKafkaProducer

# Topic that store_kafka_batch() sends every batch to.
KAFKA_TOPIC = "msg"

log = util.get_logger("db")

# Redis (legacy)
# Blocking client, used by queue_message_bulk_sync().
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)

# AIORedis
# Asyncio client on the same socket and db, used by the async queue_* functions.
ar = aioredis.from_url("unix:///var/run/redis/redis.sock", db=0)

# Message types that store_kafka_batch() normalises against mc_s.schema_main.
TYPES_MAIN = [
    "msg",
    "notice",
    "action",
    "part",
    "join",
    "kick",
    "quit",
    "nick",
    "mode",
    "topic",
    "update",
]
# Message types normalised against mc_s.schema_meta.
TYPES_META = ["who"]
# Message types normalised against mc_s.schema_int.
TYPES_INT = ["conn", "highlight", "znc", "query", "self"]
# Prefix of the Redis set keys used as the message buffer; the message's
# "src" value is appended to it.
KEYPREFIX = "queue."


def _schema_for(msg_type):
    """Return the schema mapping used for a message type, or {} if unknown."""
    if msg_type in TYPES_MAIN:
        return mc_s.schema_main
    if msg_type in TYPES_META:
        return mc_s.schema_meta
    if msg_type in TYPES_INT:
        return mc_s.schema_int
    return {}


async def _send_batch(producer, batch):
    """Send one built batch to a randomly chosen partition of KAFKA_TOPIC."""
    partitions = await producer.partitions_for(KAFKA_TOPIC)
    partition = random.choice(tuple(partitions))
    await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)
    print("%d messages sent to partition %d" % (batch.record_count(), partition))


async def store_kafka_batch(data):
    """
    Normalise a list of messages and send them to Kafka in batches.

    Each message is normalised in place: keys whose value is None are removed,
    and int values are converted to str when the schema for the message type
    declares the field as "string..." or "text...". Messages without a "ts"
    key are skipped. The remaining messages are JSON-encoded and appended to a
    producer batch; whenever a batch is full it is sent to a random partition
    of KAFKA_TOPIC and a new batch is started.

    :param data: iterable of message dicts (mutated in place)
    """
    print("STORING KAFKA BATCH")
    producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
    await producer.start()
    # Always stop the producer, even if building or sending a batch fails,
    # so its connections are not leaked.
    try:
        batch = producer.create_batch()
        for msg in data:
            # Unknown or missing types get an empty schema: no int->str
            # coercion is applied, but the message is still forwarded.
            schema = _schema_for(msg.get("type"))

            # normalise fields
            for key, value in list(msg.items()):
                if value is None:
                    del msg[key]
                    continue
                if (
                    isinstance(value, int)
                    and key in schema
                    and schema[key].startswith(("string", "text"))
                ):
                    msg[key] = str(value)

            if "ts" not in msg:
                # print("MSG WITHOUT TS", msg)
                continue

            body = ujson.dumps(msg).encode()
            metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
            if metadata is None:
                # The batch is full: flush it, then retry the same message on
                # a fresh batch so it is not dropped.
                await _send_batch(producer, batch)
                batch = producer.create_batch()
                metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
                if metadata is None:
                    log.error("Message rejected by an empty Kafka batch; dropped")

        # Flush whatever is left; nothing to send if the batch is empty.
        if batch.record_count():
            await _send_batch(producer, batch)
    finally:
        await producer.stop()


# def store_message(msg):
#     """
#     Store a message into Manticore
#     :param msg: dict
#     """
#     store_kafka(msg)
# # Duplicated to avoid extra function call
# if msg["type"] in TYPES_MAIN:
#     index = "main"
#     schema = mc_s.schema_main
# elif msg["type"] in TYPES_META:
#     index = "meta"
#     schema = mc_s.schema_meta
# elif msg["type"] in TYPES_INT:
#     index = "internal"
#     schema = mc_s.schema_int
# # normalise fields
# for key, value in list(msg.items()):
#     if value is None:
#         del msg[key]
#     if key in schema:
#         if isinstance(value, int):
#             if schema[key].startswith("string") or schema[key].startswith("text"):
#                 msg[key] = str(value)

# body = [{"insert": {"index": index, "doc": msg}}]
# body_post = ""
# for item in body:
#     body_post += ujson.dumps(item)
#     body_post += "\n"

# # print(body_post)
# try:
#     # Bulk index operations
#     print("FAKE POST")
#     #api_response = api_instance.bulk(body_post)  # , async_req=True
#     # print(api_response)
# except ApiException as e:
#     print("Exception when calling IndexApi->bulk: %s\n" % e)
#     print("ATTEMPT", body_post)


async def queue_message(msg):
    """
    Add one message to the Redis buffer.

    The message is serialised with ujson and added to the Redis set whose
    key is KEYPREFIX followed by the message's "src" value.

    :param msg: dict containing at least a "src" key
    """
    buffer_key = f"{KEYPREFIX}{msg['src']}"
    await ar.sadd(buffer_key, ujson.dumps(msg))


async def queue_message_bulk(data):
    """
    Add several messages to the Redis buffer, one SADD per message.

    Each message is serialised with ujson and added to the Redis set whose
    key is KEYPREFIX followed by that message's "src" value.

    :param data: iterable of dicts, each containing at least a "src" key
    """
    for entry in data:
        buffer_key = f"{KEYPREFIX}{entry['src']}"
        payload = ujson.dumps(entry)
        await ar.sadd(buffer_key, payload)


# For now, make a normal function until we go full async
def queue_message_bulk_sync(data):
    """
    Queue multiple messages on the Redis buffer (blocking variant).

    Each message is serialised with ujson and added, via the synchronous
    client `r`, to the Redis set whose key is KEYPREFIX followed by the
    message's "src" value.

    :param data: iterable of message dicts, each containing a "src" key
    :raises KeyError: if a message has no "src" key
    """
    for msg in data:
        src = msg["src"]
        message = ujson.dumps(msg)

        # Must be an f-string: a plain string literal would send every
        # message to the single literal key "{KEYPREFIX}{src}".
        key = f"{KEYPREFIX}{src}"
        r.sadd(key, message)


# def store_message_bulk(data):
#     """
#     Store a message into Manticore
#     :param msg: dict
#     """
#     if not data:
#         return
#     for msg in data:
#         store_kafka(msg)
# # 10000: maximum inserts we can submit to
# # Manticore as of Sept 2022
# split_posts = array_split(data, ceil(len(data) / 10000))
# for messages in split_posts:
#     total = []
#     for msg in messages:
#         # Duplicated to avoid extra function call (see above)
#         if msg["type"] in TYPES_MAIN:
#             index = "main"
#             schema = mc_s.schema_main
#         elif msg["type"] in TYPES_META:
#             index = "meta"
#             schema = mc_s.schema_meta
#         elif msg["type"] in TYPES_INT:
#             index = "internal"
#             schema = mc_s.schema_int
#         # normalise fields
#         for key, value in list(msg.items()):
#             if value is None:
#                 del msg[key]
#             if key in schema:
#                 if isinstance(value, int):
#                     if schema[key].startswith("string") or schema[key].startswith(
#                         "text"
#                     ):
#                         msg[key] = str(value)

#         body = {"insert": {"index": index, "doc": msg}}
#         total.append(body)

#     body_post = ""
#     for item in total:
#         body_post += ujson.dumps(item)
#         body_post += "\n"

#     # print(body_post)
#     try:
#         # Bulk index operations
#         print("FAKE POST")
#         #api_response = api_instance.bulk(body_post)  # , async_req=True
#         #print(api_response)
#     except ApiException as e:
#         print("Exception when calling IndexApi->bulk: %s\n" % e)
#         print("ATTEMPT", body_post)


# def update_schema():
#     pass


# def create_index(api_client):
#     util_instance = manticoresearch.UtilsApi(api_client)
#     schemas = {
#         "main": mc_s.schema_main,
#         "meta": mc_s.schema_meta,
#         "internal": mc_s.schema_int,
#     }
#     for name, schema in schemas.items():
#         schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])

#         create_query = (
#             f"create table if not exists {name}({schema_types}) engine='columnar'"
#         )
#         print("Schema types", create_query)
#         util_instance.sql(create_query)


# create_index(api_client)
# update_schema()
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`import random`
Reformat code 2 years ago			`from math import ceil`

Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`import aioredis`
Begin implementing aiohttp 2 years ago			`import manticoresearch`
Reformat code 2 years ago			`import ujson`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`from aiokafka import AIOKafkaProducer`
Begin implementing aiohttp 2 years ago			`from manticoresearch.rest import ApiException`
Reformat code 2 years ago			`from numpy import array_split`
Implement running Discord and 4chan gathering simultaneously 2 years ago			`from redis import StrictRedis`

			`import util`
Ingest into Kafka and queue messages better 2 years ago
			`# Manticore schema`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`from schemas import mc_s`

Ingest into Kafka and queue messages better 2 years ago			`# Manticore`
Begin implementing aiohttp 2 years ago			`configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")`
			`api_client = manticoresearch.ApiClient(configuration)`
			`api_instance = manticoresearch.IndexApi(api_client)`
Implement running Discord and 4chan gathering simultaneously 2 years ago
Ingest into Kafka and queue messages better 2 years ago			`# Kafka`
			`from aiokafka import AIOKafkaProducer`
Properly process Redis buffered messages and ingest into Kafka 2 years ago
Ingest into Kafka and queue messages better 2 years ago			`KAFKA_TOPIC = "msg"`

Implement running Discord and 4chan gathering simultaneously 2 years ago			`log = util.get_logger("db")`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago
			`# Redis (legacy)`
Begin implementing aiohttp 2 years ago			`r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago
			`# AIORedis`
			`ar = aioredis.from_url("unix:///var/run/redis/redis.sock", db=0)`

			`TYPES_MAIN = [`
			`"msg",`
			`"notice",`
			`"action",`
			`"part",`
			`"join",`
			`"kick",`
			`"quit",`
			`"nick",`
			`"mode",`
			`"topic",`
Add 4chan update message type to main types 2 years ago			`"update",`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago			`]`
			`TYPES_META = ["who"]`
			`TYPES_INT = ["conn", "highlight", "znc", "query", "self"]`
Ingest into Kafka and queue messages better 2 years ago			`KEYPREFIX = "queue."`


			`async def store_kafka_batch(data):`
			`print("STORING KAFKA BATCH")`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")`
Ingest into Kafka and queue messages better 2 years ago			`await producer.start()`
			`batch = producer.create_batch()`
			`for msg in data:`
			`if msg["type"] in TYPES_MAIN:`
			`index = "main"`
			`schema = mc_s.schema_main`
			`elif msg["type"] in TYPES_META:`
			`index = "meta"`
			`schema = mc_s.schema_meta`
			`elif msg["type"] in TYPES_INT:`
			`index = "internal"`
			`schema = mc_s.schema_int`
			`# normalise fields`
			`for key, value in list(msg.items()):`
			`if value is None:`
			`del msg[key]`
			`if key in schema:`
			`if isinstance(value, int):`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`if schema[key].startswith("string") or schema[key].startswith(`
			`"text"`
			`):`
Ingest into Kafka and queue messages better 2 years ago			`msg[key] = str(value)`
			`message = ujson.dumps(msg)`
			`body = str.encode(message)`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`if "ts" not in msg:`
			`# print("MSG WITHOUT TS", msg)`
			`continue`
Ingest into Kafka and queue messages better 2 years ago			`metadata = batch.append(key=None, value=body, timestamp=msg["ts"])`
			`if metadata is None:`
			`partitions = await producer.partitions_for(KAFKA_TOPIC)`
			`partition = random.choice(tuple(partitions))`
			`await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`print(`
			`"%d messages sent to partition %d" % (batch.record_count(), partition)`
			`)`
Ingest into Kafka and queue messages better 2 years ago			`batch = producer.create_batch()`
			`continue`

			`partitions = await producer.partitions_for(KAFKA_TOPIC)`
			`partition = random.choice(tuple(partitions))`
			`await producer.send_batch(batch, KAFKA_TOPIC, partition=partition)`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`print("%d messages sent to partition %d" % (batch.record_count(), partition))`
Ingest into Kafka and queue messages better 2 years ago			`await producer.stop()`

Properly process Redis buffered messages and ingest into Kafka 2 years ago
Ingest into Kafka and queue messages better 2 years ago			`# def store_message(msg):`
			`# """`
			`# Store a message into Manticore`
			`# :param msg: dict`
			`# """`
			`# store_kafka(msg)`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`# # Duplicated to avoid extra function call`
			`# if msg["type"] in TYPES_MAIN:`
			`# index = "main"`
			`# schema = mc_s.schema_main`
			`# elif msg["type"] in TYPES_META:`
			`# index = "meta"`
			`# schema = mc_s.schema_meta`
			`# elif msg["type"] in TYPES_INT:`
			`# index = "internal"`
			`# schema = mc_s.schema_int`
			`# # normalise fields`
			`# for key, value in list(msg.items()):`
			`# if value is None:`
			`# del msg[key]`
			`# if key in schema:`
			`# if isinstance(value, int):`
			`# if schema[key].startswith("string") or schema[key].startswith("text"):`
			`# msg[key] = str(value)`

			`# body = [{"insert": {"index": index, "doc": msg}}]`
			`# body_post = ""`
			`# for item in body:`
			`# body_post += ujson.dumps(item)`
			`# body_post += "\n"`

			`# # print(body_post)`
			`# try:`
			`# # Bulk index operations`
			`# print("FAKE POST")`
			`# #api_response = api_instance.bulk(body_post) # , async_req=True`
			`# # print(api_response)`
			`# except ApiException as e:`
			`# print("Exception when calling IndexApi->bulk: %s\n" % e)`
			`# print("ATTEMPT", body_post)`

Ingest into Kafka and queue messages better 2 years ago
			`async def queue_message(msg):`
			`"""`
			`Queue a message on the Redis buffer.`
			`"""`
			`src = msg["src"]`
			`message = ujson.dumps(msg)`
Implement threshold writing to Redis and manticore ingesting from Redis 2 years ago
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`key = f"{KEYPREFIX}{src}"`
Ingest into Kafka and queue messages better 2 years ago			`await ar.sadd(key, message)`
Reformat code 2 years ago
Properly process Redis buffered messages and ingest into Kafka 2 years ago
Ingest into Kafka and queue messages better 2 years ago			`async def queue_message_bulk(data):`
Implement running Discord and 4chan gathering simultaneously 2 years ago			`"""`
Ingest into Kafka and queue messages better 2 years ago			`Queue multiple messages on the Redis buffer.`
Implement running Discord and 4chan gathering simultaneously 2 years ago			`"""`
Ingest into Kafka and queue messages better 2 years ago			`for msg in data:`
			`src = msg["src"]`
			`message = ujson.dumps(msg)`

Properly process Redis buffered messages and ingest into Kafka 2 years ago			`key = f"{KEYPREFIX}{src}"`
Ingest into Kafka and queue messages better 2 years ago			`await ar.sadd(key, message)`


			`# For now, make a normal function until we go full async`
			`def queue_message_bulk_sync(data):`
Implement aiohttp 2 years ago			`"""`
Ingest into Kafka and queue messages better 2 years ago			`Queue multiple messages on the Redis buffer.`
Implement aiohttp 2 years ago			`"""`
Ingest into Kafka and queue messages better 2 years ago			`for msg in data:`
			`src = msg["src"]`
			`message = ujson.dumps(msg)`

			`key = "{KEYPREFIX}{src}"`
			`r.sadd(key, message)`


			`# def store_message_bulk(data):`
			`# """`
			`# Store a message into Manticore`
			`# :param msg: dict`
			`# """`
			`# if not data:`
			`# return`
			`# for msg in data:`
			`# store_kafka(msg)`
Properly process Redis buffered messages and ingest into Kafka 2 years ago			`# # 10000: maximum inserts we can submit to`
			`# # Manticore as of Sept 2022`
			`# split_posts = array_split(data, ceil(len(data) / 10000))`
			`# for messages in split_posts:`
			`# total = []`
			`# for msg in messages:`
			`# # Duplicated to avoid extra function call (see above)`
			`# if msg["type"] in TYPES_MAIN:`
			`# index = "main"`
			`# schema = mc_s.schema_main`
			`# elif msg["type"] in TYPES_META:`
			`# index = "meta"`
			`# schema = mc_s.schema_meta`
			`# elif msg["type"] in TYPES_INT:`
			`# index = "internal"`
			`# schema = mc_s.schema_int`
			`# # normalise fields`
			`# for key, value in list(msg.items()):`
			`# if value is None:`
			`# del msg[key]`
			`# if key in schema:`
			`# if isinstance(value, int):`
			`# if schema[key].startswith("string") or schema[key].startswith(`
			`# "text"`
			`# ):`
			`# msg[key] = str(value)`

			`# body = {"insert": {"index": index, "doc": msg}}`
			`# total.append(body)`

			`# body_post = ""`
			`# for item in total:`
			`# body_post += ujson.dumps(item)`
			`# body_post += "\n"`

			`# # print(body_post)`
			`# try:`
			`# # Bulk index operations`
			`# print("FAKE POST")`
			`# #api_response = api_instance.bulk(body_post) # , async_req=True`
			`# #print(api_response)`
			`# except ApiException as e:`
			`# print("Exception when calling IndexApi->bulk: %s\n" % e)`
			`# print("ATTEMPT", body_post)`
Ingest into Kafka and queue messages better 2 years ago

			`# def update_schema():`
			`# pass`


			`# def create_index(api_client):`
			`# util_instance = manticoresearch.UtilsApi(api_client)`
			`# schemas = {`
			`# "main": mc_s.schema_main,`
			`# "meta": mc_s.schema_meta,`
			`# "internal": mc_s.schema_int,`
			`# }`
			`# for name, schema in schemas.items():`
			`# schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])`

			`# create_query = (`
			`# f"create table if not exists {name}({schema_types}) engine='columnar'"`
			`# )`
			`# print("Schema types", create_query)`
			`# util_instance.sql(create_query)`
Begin implementing aiohttp 2 years ago

Properly process Redis buffered messages and ingest into Kafka 2 years ago			`# create_index(api_client)`
			`# update_schema()`