Implement indexing into Apache Druid #1

Closed
m wants to merge 263 commits from druid into master
3 changed files with 17 additions and 7 deletions
Showing only changes of commit 318a8ddbd5 - Show all commits

2
db.py
View File

@@ -53,6 +53,8 @@ def store_message_bulk(data):
 :param msg: dict
 """
 print("BULK", len(data))
+if not data:
+return
 split_posts = array_split(data, ceil(len(data) / 10000))
 for messages in split_posts:
 print("PROCESSING SPLIT OF", len(messages), "MESSAGES")

View File

@@ -18,6 +18,6 @@ COPY discord-patched.tgz /code/
 RUN python -m venv /venv
 RUN . /venv/bin/activate && pip install -r requirements.txt
-RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
+RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.9/site-packages
 CMD . /venv/bin/activate && exec python monolith.py

View File

@@ -14,6 +14,9 @@ import db
 import util
 from schemas.ch4_s import ATTRMAP
+from numpy import array_split
+from math import ceil
 p = ProcessPoolExecutor(10)
@@ -75,7 +78,16 @@ class Chan4(object):
 to_get.append((mapped, no))
 self.log.info(f"Got thread list for {mapped}: {len(response)}")
-await self.get_threads_content(to_get)
+print("THREAD LIST FULL LEN", len(to_get))
+if not to_get:
+await self.get_thread_lists(self.boards)
+return
+split_threads = array_split(to_get, ceil(len(to_get) / 10000))
+print("SPLIT THREADS INTO", len(split_threads))
+for threads in split_threads:
+print("SUBMITTED THREADS FOR", len(threads))
+await self.get_threads_content(threads)
+#await self.get_threads_content(to_get)
 # Recurse
 await self.get_thread_lists(self.boards)
@@ -106,6 +118,7 @@ class Chan4(object):
 # await self.handle_posts(board, thread, response["posts"])
 # await asyncio.sleep(1)
 await self.handle_posts_thread(all_posts)
+# self.handle_posts(all_posts)
 @asyncio.coroutine
 def handle_posts_thread(self, posts):
@@ -114,12 +127,9 @@ class Chan4(object):
 yield from loop.run_in_executor(p, self.handle_posts, posts)
 def handle_posts(self, posts):
-print("HANDLE POSTS START")
 to_store = []
 for key, post_list in posts.items():
 board, thread = key
-print("PROCESSING BOARD", board, "THREAD", thread)
-print("POSTS HERE", len(post_list))
 for index, post in enumerate(post_list):
 posts[key][index]["type"] = "msg"
@@ -161,9 +171,7 @@ class Chan4(object):
 # print({name_map[name]: val for name, val in post.items()})
 # print(f"Got posts: {len(posts)}")
-print("HANDLE POSTS DONE")
 db.store_message_bulk(to_store)
-print("STORE DB DONE")
 async def fetch(self, url, session, mapped):
 async with session.get(url) as response: