Implement indexing into Apache Druid #1
2
db.py
2
db.py
|
@ -53,6 +53,8 @@ def store_message_bulk(data):
|
|||
:param msg: dict
|
||||
"""
|
||||
print("BULK", len(data))
|
||||
if not data:
|
||||
return
|
||||
split_posts = array_split(data, ceil(len(data) / 10000))
|
||||
for messages in split_posts:
|
||||
print("PROCESSING SPLIT OF", len(messages), "MESSAGES")
|
||||
|
|
|
@ -18,6 +18,6 @@ COPY discord-patched.tgz /code/
|
|||
RUN python -m venv /venv
|
||||
RUN . /venv/bin/activate && pip install -r requirements.txt
|
||||
|
||||
RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
|
||||
RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.9/site-packages
|
||||
|
||||
CMD . /venv/bin/activate && exec python monolith.py
|
|
@ -14,6 +14,9 @@ import db
|
|||
import util
|
||||
from schemas.ch4_s import ATTRMAP
|
||||
|
||||
from numpy import array_split
|
||||
from math import ceil
|
||||
|
||||
p = ProcessPoolExecutor(10)
|
||||
|
||||
|
||||
|
@ -75,7 +78,16 @@ class Chan4(object):
|
|||
to_get.append((mapped, no))
|
||||
|
||||
self.log.info(f"Got thread list for {mapped}: {len(response)}")
|
||||
await self.get_threads_content(to_get)
|
||||
print("THREAD LIST FULL LEN", len(to_get))
|
||||
if not to_get:
|
||||
await self.get_thread_lists(self.boards)
|
||||
return
|
||||
split_threads = array_split(to_get, ceil(len(to_get) / 10000))
|
||||
print("SPLIT THREADS INTO", len(split_threads))
|
||||
for threads in split_threads:
|
||||
print("SUBMITTED THREADS FOR", len(threads))
|
||||
await self.get_threads_content(threads)
|
||||
#await self.get_threads_content(to_get)
|
||||
|
||||
# Recurse
|
||||
await self.get_thread_lists(self.boards)
|
||||
|
@ -106,6 +118,7 @@ class Chan4(object):
|
|||
# await self.handle_posts(board, thread, response["posts"])
|
||||
# await asyncio.sleep(1)
|
||||
await self.handle_posts_thread(all_posts)
|
||||
# self.handle_posts(all_posts)
|
||||
|
||||
@asyncio.coroutine
|
||||
def handle_posts_thread(self, posts):
|
||||
|
@ -114,12 +127,9 @@ class Chan4(object):
|
|||
yield from loop.run_in_executor(p, self.handle_posts, posts)
|
||||
|
||||
def handle_posts(self, posts):
|
||||
print("HANDLE POSTS START")
|
||||
to_store = []
|
||||
for key, post_list in posts.items():
|
||||
board, thread = key
|
||||
print("PROCESSING BOARD", board, "THREAD", thread)
|
||||
print("POSTS HERE", len(post_list))
|
||||
for index, post in enumerate(post_list):
|
||||
posts[key][index]["type"] = "msg"
|
||||
|
||||
|
@ -161,9 +171,7 @@ class Chan4(object):
|
|||
|
||||
# print({name_map[name]: val for name, val in post.items()})
|
||||
# print(f"Got posts: {len(posts)}")
|
||||
print("HANDLE POSTS DONE")
|
||||
db.store_message_bulk(to_store)
|
||||
print("STORE DB DONE")
|
||||
|
||||
async def fetch(self, url, session, mapped):
|
||||
async with session.get(url) as response:
|
||||
|
|
Loading…
Reference in New Issue