Implement indexing into Apache Druid #1

Closed
m wants to merge 263 commits from druid into master
7 changed files with 50 additions and 130 deletions
Showing only changes of commit 20e22ae7ca - Show all commits

32
db.py
View File

@ -1,12 +1,13 @@
from math import ceil
import manticoresearch import manticoresearch
import ujson
from manticoresearch.rest import ApiException from manticoresearch.rest import ApiException
from numpy import array_split
from redis import StrictRedis from redis import StrictRedis
import util import util
from schemas.mc_s import schema from schemas.mc_s import schema
import ujson
from numpy import array_split
from math import ceil
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308") configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
api_client = manticoresearch.ApiClient(configuration) api_client = manticoresearch.ApiClient(configuration)
@ -15,6 +16,7 @@ api_instance = manticoresearch.IndexApi(api_client)
log = util.get_logger("db") log = util.get_logger("db")
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0) r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
def store_message(msg): def store_message(msg):
""" """
Store a message into Manticore Store a message into Manticore
@ -30,14 +32,7 @@ def store_message(msg):
if schema[key].startswith("string"): if schema[key].startswith("string"):
msg[key] = str(value) msg[key] = str(value)
body = [ body = [{"insert": {"index": "main", "doc": msg}}]
{
"insert": {
"index": "main",
"doc": msg
}
}
]
body_post = "" body_post = ""
for item in body: for item in body:
body_post += ujson.dumps(item) body_post += ujson.dumps(item)
@ -46,11 +41,12 @@ def store_message(msg):
# print(body_post) # print(body_post)
try: try:
# Bulk index operations # Bulk index operations
api_response = api_instance.bulk(body_post, async_req=True) api_instance.bulk(body_post, async_req=True)
# print(api_response) # print(api_response)
except ApiException as e: except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e) print("Exception when calling IndexApi->bulk: %s\n" % e)
def store_message_bulk(data): def store_message_bulk(data):
""" """
Store a message into Manticore Store a message into Manticore
@ -71,12 +67,7 @@ def store_message_bulk(data):
if schema[key].startswith("string"): if schema[key].startswith("string"):
msg[key] = str(value) msg[key] = str(value)
body = { body = {"insert": {"index": "main", "doc": msg}}
"insert": {
"index": "main",
"doc": msg
}
}
total.append(body) total.append(body)
body_post = "" body_post = ""
@ -87,7 +78,7 @@ def store_message_bulk(data):
# print(body_post) # print(body_post)
try: try:
# Bulk index operations # Bulk index operations
api_response = api_instance.bulk(body_post, async_req=True) api_instance.bulk(body_post, async_req=True)
# print(api_response) # print(api_response)
except ApiException as e: except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e) print("Exception when calling IndexApi->bulk: %s\n" % e)
@ -95,9 +86,11 @@ def store_message_bulk(data):
print("BULK FINISH") print("BULK FINISH")
def update_schema(): def update_schema():
pass pass
def create_index(api_client): def create_index(api_client):
util_instance = manticoresearch.UtilsApi(api_client) util_instance = manticoresearch.UtilsApi(api_client)
schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()]) schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])
@ -107,6 +100,5 @@ def create_index(api_client):
util_instance.sql(create_query) util_instance.sql(create_query)
create_index(api_client) create_index(api_client)
update_schema() update_schema()

View File

@ -8,6 +8,7 @@ from sources.dis import DiscordClient
# For development # For development
if not getenv("DISCORD_TOKEN", None): if not getenv("DISCORD_TOKEN", None):
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
log = util.get_logger("monolith") log = util.get_logger("monolith")

View File

@ -1,199 +1,132 @@
schema = { schema = {
"id": "bigint", "id": "bigint",
# 1 # 1
"archived": "int", "archived": "int",
# 1662150538 # 1662150538
"archived_on": "string indexed attribute", "archived_on": "string indexed attribute",
# CF # CF
"board_flag": "string indexed attribute", "board_flag": "string indexed attribute",
# true, false # true, false
"bot": "bool", "bot": "bool",
# 0 # 0
"bumplimit": "int", "bumplimit": "int",
# mod # mod
"capcode": "string indexed attribute", "capcode": "string indexed attribute",
# 393598265, #main, Rust Programmer's Club # 393598265, #main, Rust Programmer's Club
"channel": "text", "channel": "text",
# Miscellaneous # Miscellaneous
"channel_category": "text", "channel_category": "text",
# 360581491907887100 # 360581491907887100
"channel_category_id": "string indexed attribute", "channel_category_id": "string indexed attribute",
# true, false # true, false
"channel_category_nsfw": "bool", "channel_category_nsfw": "bool",
# 734229101216530600 # 734229101216530600
"channel_id": "string indexed attribute", "channel_id": "string indexed attribute",
# true, false # true, false
"channel_nsfw": "bool", "channel_nsfw": "bool",
# 1 # 1
"closed": "int", "closed": "int",
# GB # GB
"country": "string indexed attribute", "country": "string indexed attribute",
# United Kingdom # United Kingdom
"country_name": "text", "country_name": "text",
# 5 # 5
"file_custom_spoiler": "int", "file_custom_spoiler": "int",
# 1 # 1
"file_deleted": "int", "file_deleted": "int",
# .jpg # .jpg
"file_ext": "string indexed attribute", "file_ext": "string indexed attribute",
# 1024 # 1024
"file_h": "int", "file_h": "int",
# 1 # 1
"file_m_img": "int", "file_m_img": "int",
# tlArbrZDj7kbheSKPyDU0w== # tlArbrZDj7kbheSKPyDU0w==
"file_md5": "string indexed attribute", "file_md5": "string indexed attribute",
# 88967 # 88967
"file_size": "int", "file_size": "int",
# 1 # 1
"file_spoiler": "int", "file_spoiler": "int",
# 1662149436322819 # 1662149436322819
"file_tim": "string indexed attribute", "file_tim": "string indexed attribute",
# 250 # 250
"file_tn_h": "int", "file_tn_h": "int",
# 241 # 241
"file_tn_w": "int", "file_tn_w": "int",
# 1080 # 1080
"file_w": "int", "file_w": "int",
# 6E646BED-297E-4B4F-9082-31EDADC49472 # 6E646BED-297E-4B4F-9082-31EDADC49472
"filename": "text", "filename": "text",
# Confederate # Confederate
"flag_name": "string indexed attribute", "flag_name": "string indexed attribute",
"guild": "text", # LEGACY -> channel "guild": "text", # LEGACY -> channel
"guild_id": "string indexed attribute", # LEGACY -> channel_id "guild_id": "string indexed attribute", # LEGACY -> channel_id
# 36180 # 36180
"guild_member_count": "int", # ? -> channel_member_count "guild_member_count": "int", # ? -> channel_member_count
# 9f7b2e6a0e9b # 9f7b2e6a0e9b
"host": "text", "host": "text",
# 2447746 # 2447746
"id_reply": "string indexed attribute", # resto "id_reply": "string indexed attribute", # resto
# "522, trans rights shill", myname # "522, trans rights shill", myname
"ident": "text", "ident": "text",
# 0 # 0
"imagelimit": "int", "imagelimit": "int",
# 0 # 0
"images": "int", "images": "int",
# 0 # 0
"mode": "string indexed attribute", "mode": "string indexed attribute",
# b0n3 # b0n3
"modearg": "string indexed attribute", "modearg": "string indexed attribute",
# The quick brown fox jumped over the lazy dog # The quick brown fox jumped over the lazy dog
"msg": "text", "msg": "text",
# 393605030 # 393605030
"msg_id": "string indexed attribute", "msg_id": "string indexed attribute",
# pol # pol
"net": "text", "net": "text",
# 273534239310479360 # 273534239310479360
"net_id": "string indexed attribute", "net_id": "string indexed attribute",
# André de Santa Cruz, santa # André de Santa Cruz, santa
"nick": "text", "nick": "text",
# 773802568324350000 # 773802568324350000
"nick_id": "string indexed attribute", "nick_id": "string indexed attribute",
# 1, 2, 3, 4, 5, 6, ... # 1, 2, 3, 4, 5, 6, ...
"num": "int", "num": "int",
# 12 # 12
"replies": "int", "replies": "int",
# redacted-hate-thread # redacted-hate-thread
"semantic_url": "string indexed attribute", "semantic_url": "string indexed attribute",
# -1 -> 1 as float # -1 -> 1 as float
"sentiment": "float", "sentiment": "float",
# 2022 # 2022
"since4pass": "int", "since4pass": "int",
# 4ch, irc, dis # 4ch, irc, dis
"src": "string indexed attribute", "src": "string indexed attribute",
# true, false # true, false
"status": "bool", "status": "bool",
# 1 # 1
"sticky": "int", "sticky": "int",
# 1000 # 1000
"sticky_cap": "int", "sticky_cap": "int",
# Redacted Hate Thread, Gorbachev is dead # Redacted Hate Thread, Gorbachev is dead
"sub": "string indexed attribute", "sub": "string indexed attribute",
# Loop # Loop
"tag": "string indexed attribute", "tag": "string indexed attribute",
# 100 # 100
"tail_size": "int", "tail_size": "int",
"time": "timestamp", # LEGACY -> ts "time": "timestamp", # LEGACY -> ts
"tokens": "text", # ??? "tokens": "text", # ???
# 2022-09-02T16:10:36 # 2022-09-02T16:10:36
"ts": "timestamp", "ts": "timestamp",
# msg, notice, update, who # msg, notice, update, who
"type": "string indexed attribute", "type": "string indexed attribute",
# 10 # 10
"unique_ips": "int", "unique_ips": "int",
# 1662149436 # 1662149436
"unix_time": "string indexed attribute", "unix_time": "string indexed attribute",
# Anonymous # Anonymous
"user": "text", "user": "text",
"user_id": "string indexed attribute", # LEGACY -> nick_id "user_id": "string indexed attribute", # LEGACY -> nick_id
# 1, 2 # 1, 2
"version_sentiment": "int", "version_sentiment": "int",
# 1, 2 # 1, 2
"version_tokens": "int", "version_tokens": "int",
} }

View File

@ -1,23 +1,22 @@
# Python modules can't start with a number... # Python modules can't start with a number...
import ujson import asyncio
import random import random
import string import string
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime from datetime import datetime
import aiohttp
import ujson
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from siphashc import siphash from siphashc import siphash
import db import db
import util import util
from schemas.ch4_s import ATTRMAP from schemas.ch4_s import ATTRMAP
import aiohttp
import asyncio
from numpy import array_split
from math import ceil
from concurrent.futures import ProcessPoolExecutor
p = ProcessPoolExecutor(10) p = ProcessPoolExecutor(10)
class Chan4(object): class Chan4(object):
""" """
4chan indexer, crawler and ingester. 4chan indexer, crawler and ingester.
@ -82,7 +81,10 @@ class Chan4(object):
await self.get_thread_lists(self.boards) await self.get_thread_lists(self.boards)
async def get_threads_content(self, thread_list): async def get_threads_content(self, thread_list):
thread_urls = {(board, thread): f"{board}/thread/{thread}.json" for board, thread in thread_list} thread_urls = {
(board, thread): f"{board}/thread/{thread}.json"
for board, thread in thread_list
}
self.log.debug(f"Getting information for threads: {thread_urls}") self.log.debug(f"Getting information for threads: {thread_urls}")
responses = await self.api_call(thread_urls) responses = await self.api_call(thread_urls)
self.log.debug(f"Got information for threads: {thread_urls}") self.log.debug(f"Got information for threads: {thread_urls}")
@ -167,26 +169,20 @@ class Chan4(object):
async with session.get(url) as response: async with session.get(url) as response:
try: try:
return (mapped, await response.json()) return (mapped, await response.json())
except: except: # noqa
print("FETCH ERROR") print("FETCH ERROR")
return (mapped, None) return (mapped, None)
async def bound_fetch(self, sem, url, session, mapped): async def bound_fetch(self, sem, url, session, mapped):
# Getter function with semaphore. # Getter function with semaphore.
async with sem: async with sem:
try: try:
return await self.fetch(url, session, mapped) return await self.fetch(url, session, mapped)
except: except: # noqa
print("BOUND ERROR") print("BOUND ERROR")
return (mapped, None) return (mapped, None)
async def api_call(self, methods={}): async def api_call(self, methods={}):
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
)
}
tasks = [] tasks = []
sem = asyncio.Semaphore(100) sem = asyncio.Semaphore(100)
connector = aiohttp.TCPConnector(limit=None) connector = aiohttp.TCPConnector(limit=None)
@ -199,4 +195,3 @@ class Chan4(object):
tasks.append(task) tasks.append(task)
responses = await asyncio.gather(*tasks) responses = await asyncio.gather(*tasks)
return responses return responses

View File

@ -5,7 +5,6 @@ import discord
import db import db
import util import util
from schemas.dis_s import ATTRMAP from schemas.dis_s import ATTRMAP