Implement indexing into Apache Druid #1

Closed
m wants to merge 263 commits from druid into master
5 changed files with 155 additions and 170 deletions
Showing only changes of commit 22cef33342 - Show all commits

45
db.py
View File

@ -19,8 +19,7 @@ def store_message(msg):
Store a message into Manticore Store a message into Manticore
:param msg: dict :param msg: dict
""" """
log.info(f"store_message() {msg}") print("DISCORD MSGLEN", len(msg["msg"]))
# normalise fields # normalise fields
for key, value in list(msg.items()): for key, value in list(msg.items()):
if value is None: if value is None:
@ -46,8 +45,46 @@ def store_message(msg):
#print(body_post) #print(body_post)
try: try:
# Bulk index operations # Bulk index operations
api_response = api_instance.bulk(body_post) api_response = api_instance.bulk(body_post, async_req=True)
pprint(api_response) #print(api_response)
except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e)
async def store_message_bulk(messages):
"""
Store a message into Manticore
:param msg: dict
"""
print("BULK", len(messages))
total = []
for msg in messages:
# normalise fields
for key, value in list(msg.items()):
if value is None:
del msg[key]
if key in schema:
if isinstance(value, int):
if schema[key].startswith("string"):
msg[key] = str(value)
body = {
"insert": {
"index": "main",
"doc": msg
}
}
total.append(body)
body_post = ""
for item in total:
body_post += json.dumps(item)
body_post += "\n"
#print(body_post)
try:
# Bulk index operations
api_response = api_instance.bulk(body_post, async_req=True)
#print(api_response)
except ApiException as e: except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e) print("Exception when calling IndexApi->bulk: %s\n" % e)

View File

@ -11,8 +11,8 @@ services:
volumes_from: volumes_from:
- tmp - tmp
depends_on: depends_on:
- "db" - db
- "redis"
db: db:
image: manticoresearch/manticore image: manticoresearch/manticore
@ -33,6 +33,7 @@ services:
volumes: volumes:
- ./docker/data:/var/lib/manticore - ./docker/data:/var/lib/manticore
tmp: tmp:
image: busybox image: busybox
command: chmod -R 777 /var/run/redis command: chmod -R 777 /var/run/redis

View File

@ -25,14 +25,9 @@ async def main(loop):
loop.create_task(client.start(token)) loop.create_task(client.start(token))
#client.run(token) #client.run(token)
# log.info("Starting 4chan handler.") log.info("Starting 4chan handler.")
# chan = Chan4() chan = Chan4()
# #running = chan.run() await chan.run()
# chan.run()
#deferred.addCallback(lambda: None)
#reactor.callLater(0.1, deferred.callback, None)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()

View File

@ -12,6 +12,8 @@ from siphashc import siphash
import db import db
import util import util
from schemas.ch4_s import ATTRMAP from schemas.ch4_s import ATTRMAP
import aiohttp
import asyncio
class Chan4(object): class Chan4(object):
""" """
@ -42,90 +44,54 @@ class Chan4(object):
self.hash_key = self.hash_key.decode("ascii") self.hash_key = self.hash_key.decode("ascii")
self.log.debug(f"Decoded hash key: {self.hash_key}") self.log.debug(f"Decoded hash key: {self.hash_key}")
@inlineCallbacks async def run(self):
def run(self): await self.get_board_list()
yield self.get_board_list()
def got_thread_lists(self, thread_lists): async def get_board_list(self):
print("GOT THREAD LIST", thread_lists) # responses = await self.api_call({"_": "boards.json"})
# Instead of while True, do it again! # for mapped, response in responses:
d = self.get_thread_lists() # if not response:
d.addCallback(self.got_thread_lists) # continue
# @inlineCallbacks # for board in response["boards"]:
# def mainloop(self): # self.boards.append(board["board"])
# while True: # self.log.debug(f"Got boards: {self.boards}")
# yield self.get_thread_lists()
# yield self.get_thread_contents()
@inlineCallbacks await self.get_thread_lists(self.boards)
def get_board_list(self):
self.log.debug("Getting board list")
response = self.api_call("boards.json")
response.addCallback(self.got_board_list)
yield response
@inlineCallbacks async def get_thread_lists(self, boards):
def got_board_list(self, board_list): self.log.debug(f"Getting thread list for {boards}")
if board_list["success"]: board_urls = {board: f"{board}/catalog.json" for board in boards}
for board in board_list["response"]["boards"]: responses = await self.api_call(board_urls)
self.boards.append(board["board"]) to_get = []
self.log.debug(f"Got boards: {self.boards}") for mapped, response in responses:
d = self.get_thread_lists() if not response:
d.addCallback(self.got_thread_lists) continue
yield d for page in response:
@inlineCallbacks
def get_thread_lists(self):
thread_deferreds = []
for board in self.boards:
d = self.get_thread_list(board)
d.addCallback(self.got_thread_list, board)
thread_deferreds.append(d)
yield defer.gatherResults(thread_deferreds)
def get_thread_list(self, board):
self.log.debug(f"Getting thread list for {board}")
response = self.api_call(f"{board}/catalog.json")
return response
def got_thread_list(self, thread_list, board):
if not thread_list:
self.log.error(f"Thread list invalid: {thread_list} {board}")
return
if thread_list["success"]:
#self.thread_list[board] = thread_list["response"]
for page in thread_list["response"]:
for threads in page["threads"]: for threads in page["threads"]:
no = threads["no"] no = threads["no"]
d = self.get_thread_content(board, no) to_get.append((mapped, no))
d.addCallback(self.got_thread_content, board, no)
self.log.info(f"Got thread list for {board}: {len(thread_list)}")
def get_thread_content(self, board, thread): self.log.info(f"Got thread list for {mapped}: {len(response)}")
self.log.debug(f"Getting information for thread {thread} on board {board}") await self.get_threads_content(to_get)
response = self.api_call(f"{board}/thread/{thread}.json")
return response
def got_thread_content(self, thread_content, board, thread): # Recurse
if not thread_content: await self.get_thread_lists(self.boards)
self.log.error(f"Thread content invalid: {thread_content} {board} {thread}")
return async def get_threads_content(self, thread_list):
if thread_content["success"]: thread_urls = {(board, thread): f"{board}/thread/{thread}.json" for board, thread in thread_list}
self.log.debug(f"Getting information for threads: {thread_urls}")
responses = await self.api_call(thread_urls)
self.log.debug(f"Got information for threads: {thread_urls}")
for mapped, response in responses:
if not response:
continue
board, thread = mapped
self.log.debug(f"Got thread content for thread {thread} on board {board}") self.log.debug(f"Got thread content for thread {thread} on board {board}")
for post in thread_content["response"]["posts"]: await self.handle_posts(board, thread, response["posts"])
# print(post)
self.handle_post(board, thread, post)
else:
self.log.error(
(
f"Error fetching thread {thread} on board {board}: "
f"{thread_content['message']}"
)
)
def handle_post(self, board, thread, post): async def handle_posts(self, board, thread, posts):
post["type"] = "msg" for index, post in enumerate(posts):
posts[index]["type"] = "msg"
# Calculate hash for post # Calculate hash for post
post_normalised = json.dumps(post, sort_keys=True) post_normalised = json.dumps(post, sort_keys=True)
@ -138,16 +104,15 @@ class Chan4(object):
if key_content == hash: if key_content == hash:
return return
else: else:
post["type"] = "update" posts[index]["type"] = "update"
db.r.set(redis_key, hash) db.r.set(redis_key, hash)
# Check if hash exists
# Store the hash
for key, value in list(post.items()): for key, value in list(post.items()):
if key in ATTRMAP: if key in ATTRMAP:
post[ATTRMAP[key]] = post[key] post[ATTRMAP[key]] = posts[index][key]
del post[key] del posts[index][key]
if "ts" in post: if "ts" in post:
old_time = post["ts"] old_time = posts[index]["ts"]
# '08/30/22(Tue)02:25:37' # '08/30/22(Tue)02:25:37'
time_spl = old_time.split(":") time_spl = old_time.split(":")
if len(time_spl) == 3: if len(time_spl) == 3:
@ -155,60 +120,47 @@ class Chan4(object):
else: else:
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M") old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
new_ts = old_ts.isoformat() new_ts = old_ts.isoformat()
post["ts"] = new_ts posts[index]["ts"] = new_ts
if "msg" in post: if "msg" in post:
soup = BeautifulSoup(post["msg"], "html.parser") soup = BeautifulSoup(posts[index]["msg"], "html.parser")
msg = soup.get_text(separator="\n") msg = soup.get_text(separator="\n")
post["msg"] = msg posts[index]["msg"] = msg
post["src"] = "4ch" posts[index]["src"] = "4ch"
# print({name_map[name]: val for name, val in post.items()}) # print({name_map[name]: val for name, val in post.items()})
db.store_message(post) #print(f"Got posts: {len(posts)}")
await db.store_message_bulk(posts)
def dump(self, *args, **kwargs): async def fetch(self, url, session, mapped):
self.log.error(f"Error: {args} {kwargs}") async with session.get(url) as response:
return (mapped, await response.json())
@inlineCallbacks
def callback_api_call(self, response, result): async def bound_fetch(self, sem, url, session, mapped):
result["status"] = response.code # Getter function with semaphore.
async with sem:
try: try:
text = yield response.content() return await self.fetch(url, session, mapped)
except: # noqa except:
self.log.error("Error with API call") return (mapped, None)
return False
#print("RESP TEXT", text)
try:
result["response"] = json.loads(text)
except json.decoder.JSONDecodeError:
result["success"] = "ERROR"
result["message"] = "Error parsing JSON."
return result
#print("RESP AFTER JSON", result)
result["status"] = response.code
if response.code == 200:
result["success"] = True
result["message"] = "OK"
else:
result["message"] = "API ERROR"
return result async def api_call(self, methods={}):
def api_call(self, method: str):
headers = { headers = {
"User-Agent": ( "User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
) )
} }
tasks = []
sem = asyncio.Semaphore(100)
connector = aiohttp.TCPConnector(limit=None)
async with aiohttp.ClientSession(connector=connector) as session:
for mapped, method in methods.items():
url = f"{self.api_endpoint}/{method}" url = f"{self.api_endpoint}/{method}"
self.log.debug(f"GET {url}") self.log.debug(f"GET {url}")
response = treq.get(url, headers=headers) task = asyncio.create_task(self.bound_fetch(sem, url, session, mapped))
result: Dict[str, Any] = { #task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
"success": False, tasks.append(task)
"message": "Call not successful", responses = await asyncio.gather(*tasks)
"response": None, return responses
"status": None,
}
response.addCallback(self.callback_api_call, result)
response.addErrback(self.dump, url=url)
return response

View File

@ -3,7 +3,7 @@ import logging
log = logging.getLogger("util") log = logging.getLogger("util")
debug = False debug = True
# Color definitions # Color definitions
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)