Implement indexing into Apache Druid #1
2
db.py
2
db.py
|
@ -53,6 +53,8 @@ def store_message_bulk(data):
|
||||||
:param msg: dict
|
:param msg: dict
|
||||||
"""
|
"""
|
||||||
print("BULK", len(data))
|
print("BULK", len(data))
|
||||||
|
if not data:
|
||||||
|
return
|
||||||
split_posts = array_split(data, ceil(len(data) / 10000))
|
split_posts = array_split(data, ceil(len(data) / 10000))
|
||||||
for messages in split_posts:
|
for messages in split_posts:
|
||||||
print("PROCESSING SPLIT OF", len(messages), "MESSAGES")
|
print("PROCESSING SPLIT OF", len(messages), "MESSAGES")
|
||||||
|
|
|
@ -18,6 +18,6 @@ COPY discord-patched.tgz /code/
|
||||||
RUN python -m venv /venv
|
RUN python -m venv /venv
|
||||||
RUN . /venv/bin/activate && pip install -r requirements.txt
|
RUN . /venv/bin/activate && pip install -r requirements.txt
|
||||||
|
|
||||||
RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
|
RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.9/site-packages
|
||||||
|
|
||||||
CMD . /venv/bin/activate && exec python monolith.py
|
CMD . /venv/bin/activate && exec python monolith.py
|
|
@ -14,6 +14,9 @@ import db
|
||||||
import util
|
import util
|
||||||
from schemas.ch4_s import ATTRMAP
|
from schemas.ch4_s import ATTRMAP
|
||||||
|
|
||||||
|
from numpy import array_split
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
p = ProcessPoolExecutor(10)
|
p = ProcessPoolExecutor(10)
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,7 +78,16 @@ class Chan4(object):
|
||||||
to_get.append((mapped, no))
|
to_get.append((mapped, no))
|
||||||
|
|
||||||
self.log.info(f"Got thread list for {mapped}: {len(response)}")
|
self.log.info(f"Got thread list for {mapped}: {len(response)}")
|
||||||
await self.get_threads_content(to_get)
|
print("THREAD LIST FULL LEN", len(to_get))
|
||||||
|
if not to_get:
|
||||||
|
await self.get_thread_lists(self.boards)
|
||||||
|
return
|
||||||
|
split_threads = array_split(to_get, ceil(len(to_get) / 10000))
|
||||||
|
print("SPLIT THREADS INTO", len(split_threads))
|
||||||
|
for threads in split_threads:
|
||||||
|
print("SUBMITTED THREADS FOR", len(threads))
|
||||||
|
await self.get_threads_content(threads)
|
||||||
|
#await self.get_threads_content(to_get)
|
||||||
|
|
||||||
# Recurse
|
# Recurse
|
||||||
await self.get_thread_lists(self.boards)
|
await self.get_thread_lists(self.boards)
|
||||||
|
@ -106,6 +118,7 @@ class Chan4(object):
|
||||||
# await self.handle_posts(board, thread, response["posts"])
|
# await self.handle_posts(board, thread, response["posts"])
|
||||||
# await asyncio.sleep(1)
|
# await asyncio.sleep(1)
|
||||||
await self.handle_posts_thread(all_posts)
|
await self.handle_posts_thread(all_posts)
|
||||||
|
# self.handle_posts(all_posts)
|
||||||
|
|
||||||
@asyncio.coroutine
|
@asyncio.coroutine
|
||||||
def handle_posts_thread(self, posts):
|
def handle_posts_thread(self, posts):
|
||||||
|
@ -114,12 +127,9 @@ class Chan4(object):
|
||||||
yield from loop.run_in_executor(p, self.handle_posts, posts)
|
yield from loop.run_in_executor(p, self.handle_posts, posts)
|
||||||
|
|
||||||
def handle_posts(self, posts):
|
def handle_posts(self, posts):
|
||||||
print("HANDLE POSTS START")
|
|
||||||
to_store = []
|
to_store = []
|
||||||
for key, post_list in posts.items():
|
for key, post_list in posts.items():
|
||||||
board, thread = key
|
board, thread = key
|
||||||
print("PROCESSING BOARD", board, "THREAD", thread)
|
|
||||||
print("POSTS HERE", len(post_list))
|
|
||||||
for index, post in enumerate(post_list):
|
for index, post in enumerate(post_list):
|
||||||
posts[key][index]["type"] = "msg"
|
posts[key][index]["type"] = "msg"
|
||||||
|
|
||||||
|
@ -161,9 +171,7 @@ class Chan4(object):
|
||||||
|
|
||||||
# print({name_map[name]: val for name, val in post.items()})
|
# print({name_map[name]: val for name, val in post.items()})
|
||||||
# print(f"Got posts: {len(posts)}")
|
# print(f"Got posts: {len(posts)}")
|
||||||
print("HANDLE POSTS DONE")
|
|
||||||
db.store_message_bulk(to_store)
|
db.store_message_bulk(to_store)
|
||||||
print("STORE DB DONE")
|
|
||||||
|
|
||||||
async def fetch(self, url, session, mapped):
|
async def fetch(self, url, session, mapped):
|
||||||
async with session.get(url) as response:
|
async with session.get(url) as response:
|
||||||
|
|
Loading…
Reference in New Issue