Implement aiohttp

pull/1/head
Mark Veidemanis 2 years ago
parent 663a26778d
commit 22cef33342
Signed by: m
GPG Key ID: 5ACFCEED46C0904F

45
db.py

@ -19,8 +19,7 @@ def store_message(msg):
Store a message into Manticore Store a message into Manticore
:param msg: dict :param msg: dict
""" """
log.info(f"store_message() {msg}") print("DISCORD MSGLEN", len(msg["msg"]))
# normalise fields # normalise fields
for key, value in list(msg.items()): for key, value in list(msg.items()):
if value is None: if value is None:
@ -46,8 +45,46 @@ def store_message(msg):
#print(body_post) #print(body_post)
try: try:
# Bulk index operations # Bulk index operations
api_response = api_instance.bulk(body_post) api_response = api_instance.bulk(body_post, async_req=True)
pprint(api_response) #print(api_response)
except ApiException as e:
print("Exception when calling IndexApi->bulk: %s\n" % e)
async def store_message_bulk(messages):
    """
    Store a batch of messages into Manticore with one bulk request.

    Each message dict is normalised in place: keys whose value is None are
    removed, and integer values are converted to strings when the schema
    entry for that key starts with "string". Every message is then wrapped
    in an "insert" operation on the "main" index and the operations are sent
    as newline-delimited JSON.

    :param messages: list of message dicts (mutated in place)
    :return: None
    """
    print("BULK", len(messages))
    if not messages:
        # Nothing to index; do not submit an empty bulk body.
        return

    operations = []
    for msg in messages:
        # normalise fields (iterate over a snapshot, the dict is mutated)
        for key, value in list(msg.items()):
            if value is None:
                del msg[key]
                continue
            if key in schema and isinstance(value, int):
                if schema[key].startswith("string"):
                    msg[key] = str(value)
        operations.append({"insert": {"index": "main", "doc": msg}})

    # One JSON document per line, each line newline-terminated.
    body_post = "".join(json.dumps(item) + "\n" for item in operations)

    try:
        # Bulk index operations
        # NOTE(review): with async_req=True the client presumably returns a
        # handle immediately and reports request failures only when that
        # handle's result is fetched, so this handler may never see server
        # errors -- confirm against the client documentation.
        api_instance.bulk(body_post, async_req=True)
    except ApiException as e:
        print("Exception when calling IndexApi->bulk: %s\n" % e)

@ -11,8 +11,8 @@ services:
volumes_from: volumes_from:
- tmp - tmp
depends_on: depends_on:
- "db" - db
- "redis"
db: db:
image: manticoresearch/manticore image: manticoresearch/manticore
@ -33,6 +33,7 @@ services:
volumes: volumes:
- ./docker/data:/var/lib/manticore - ./docker/data:/var/lib/manticore
tmp: tmp:
image: busybox image: busybox
command: chmod -R 777 /var/run/redis command: chmod -R 777 /var/run/redis
@ -40,12 +41,12 @@ services:
- /var/run/redis - /var/run/redis
redis: redis:
image: redis image: redis
command: redis-server /etc/redis.conf command: redis-server /etc/redis.conf
volumes: volumes:
- ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf - ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf
volumes_from: volumes_from:
- tmp - tmp
networks: networks:
default: default:

@ -25,14 +25,9 @@ async def main(loop):
loop.create_task(client.start(token)) loop.create_task(client.start(token))
#client.run(token) #client.run(token)
# log.info("Starting 4chan handler.") log.info("Starting 4chan handler.")
# chan = Chan4() chan = Chan4()
# #running = chan.run() await chan.run()
# chan.run()
#deferred.addCallback(lambda: None)
#reactor.callLater(0.1, deferred.callback, None)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()

@ -12,6 +12,8 @@ from siphashc import siphash
import db import db
import util import util
from schemas.ch4_s import ATTRMAP from schemas.ch4_s import ATTRMAP
import aiohttp
import asyncio
class Chan4(object): class Chan4(object):
""" """
@ -42,173 +44,123 @@ class Chan4(object):
self.hash_key = self.hash_key.decode("ascii") self.hash_key = self.hash_key.decode("ascii")
self.log.debug(f"Decoded hash key: {self.hash_key}") self.log.debug(f"Decoded hash key: {self.hash_key}")
@inlineCallbacks async def run(self):
def run(self): await self.get_board_list()
yield self.get_board_list()
async def get_board_list(self):
def got_thread_lists(self, thread_lists): # responses = await self.api_call({"_": "boards.json"})
print("GOT THREAD LIST", thread_lists) # for mapped, response in responses:
# Instead of while True, do it again! # if not response:
d = self.get_thread_lists() # continue
d.addCallback(self.got_thread_lists) # for board in response["boards"]:
# @inlineCallbacks # self.boards.append(board["board"])
# def mainloop(self): # self.log.debug(f"Got boards: {self.boards}")
# while True:
# yield self.get_thread_lists() await self.get_thread_lists(self.boards)
# yield self.get_thread_contents()
async def get_thread_lists(self, boards):
@inlineCallbacks self.log.debug(f"Getting thread list for {boards}")
def get_board_list(self): board_urls = {board: f"{board}/catalog.json" for board in boards}
self.log.debug("Getting board list") responses = await self.api_call(board_urls)
response = self.api_call("boards.json") to_get = []
response.addCallback(self.got_board_list) for mapped, response in responses:
yield response if not response:
continue
@inlineCallbacks for page in response:
def got_board_list(self, board_list):
if board_list["success"]:
for board in board_list["response"]["boards"]:
self.boards.append(board["board"])
self.log.debug(f"Got boards: {self.boards}")
d = self.get_thread_lists()
d.addCallback(self.got_thread_lists)
yield d
@inlineCallbacks
def get_thread_lists(self):
thread_deferreds = []
for board in self.boards:
d = self.get_thread_list(board)
d.addCallback(self.got_thread_list, board)
thread_deferreds.append(d)
yield defer.gatherResults(thread_deferreds)
def get_thread_list(self, board):
self.log.debug(f"Getting thread list for {board}")
response = self.api_call(f"{board}/catalog.json")
return response
def got_thread_list(self, thread_list, board):
if not thread_list:
self.log.error(f"Thread list invalid: {thread_list} {board}")
return
if thread_list["success"]:
#self.thread_list[board] = thread_list["response"]
for page in thread_list["response"]:
for threads in page["threads"]: for threads in page["threads"]:
no = threads["no"] no = threads["no"]
d = self.get_thread_content(board, no) to_get.append((mapped, no))
d.addCallback(self.got_thread_content, board, no)
self.log.info(f"Got thread list for {board}: {len(thread_list)}") self.log.info(f"Got thread list for {mapped}: {len(response)}")
await self.get_threads_content(to_get)
def get_thread_content(self, board, thread):
self.log.debug(f"Getting information for thread {thread} on board {board}") # Recurse
response = self.api_call(f"{board}/thread/{thread}.json") await self.get_thread_lists(self.boards)
return response
async def get_threads_content(self, thread_list):
def got_thread_content(self, thread_content, board, thread): thread_urls = {(board, thread): f"{board}/thread/{thread}.json" for board, thread in thread_list}
if not thread_content: self.log.debug(f"Getting information for threads: {thread_urls}")
self.log.error(f"Thread content invalid: {thread_content} {board} {thread}") responses = await self.api_call(thread_urls)
return self.log.debug(f"Got information for threads: {thread_urls}")
if thread_content["success"]: for mapped, response in responses:
if not response:
continue
board, thread = mapped
self.log.debug(f"Got thread content for thread {thread} on board {board}") self.log.debug(f"Got thread content for thread {thread} on board {board}")
for post in thread_content["response"]["posts"]: await self.handle_posts(board, thread, response["posts"])
# print(post)
self.handle_post(board, thread, post) async def handle_posts(self, board, thread, posts):
else: for index, post in enumerate(posts):
self.log.error( posts[index]["type"] = "msg"
(
f"Error fetching thread {thread} on board {board}: " # Calculate hash for post
f"{thread_content['message']}" post_normalised = json.dumps(post, sort_keys=True)
) hash = siphash(self.hash_key, post_normalised)
) hash = str(hash)
redis_key = f"cache.{board}.{thread}.{post['no']}"
def handle_post(self, board, thread, post): key_content = db.r.get(redis_key)
post["type"] = "msg" if key_content:
key_content = key_content.decode("ascii")
# Calculate hash for post if key_content == hash:
post_normalised = json.dumps(post, sort_keys=True) return
hash = siphash(self.hash_key, post_normalised) else:
hash = str(hash) posts[index]["type"] = "update"
redis_key = f"cache.{board}.{thread}.{post['no']}" db.r.set(redis_key, hash)
key_content = db.r.get(redis_key)
if key_content: for key, value in list(post.items()):
key_content = key_content.decode("ascii") if key in ATTRMAP:
if key_content == hash: post[ATTRMAP[key]] = posts[index][key]
return del posts[index][key]
else: if "ts" in post:
post["type"] = "update" old_time = posts[index]["ts"]
db.r.set(redis_key, hash) # '08/30/22(Tue)02:25:37'
# Check if hash exists time_spl = old_time.split(":")
# Store the hash if len(time_spl) == 3:
for key, value in list(post.items()): old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
if key in ATTRMAP: else:
post[ATTRMAP[key]] = post[key] old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
del post[key] new_ts = old_ts.isoformat()
if "ts" in post: posts[index]["ts"] = new_ts
old_time = post["ts"] if "msg" in post:
# '08/30/22(Tue)02:25:37' soup = BeautifulSoup(posts[index]["msg"], "html.parser")
time_spl = old_time.split(":") msg = soup.get_text(separator="\n")
if len(time_spl) == 3: posts[index]["msg"] = msg
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
else: posts[index]["src"] = "4ch"
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
new_ts = old_ts.isoformat() # print({name_map[name]: val for name, val in post.items()})
post["ts"] = new_ts #print(f"Got posts: {len(posts)}")
if "msg" in post: await db.store_message_bulk(posts)
soup = BeautifulSoup(post["msg"], "html.parser")
msg = soup.get_text(separator="\n") async def fetch(self, url, session, mapped):
post["msg"] = msg async with session.get(url) as response:
return (mapped, await response.json())
post["src"] = "4ch"
# print({name_map[name]: val for name, val in post.items()}) async def bound_fetch(self, sem, url, session, mapped):
db.store_message(post) # Getter function with semaphore.
async with sem:
def dump(self, *args, **kwargs): try:
self.log.error(f"Error: {args} {kwargs}") return await self.fetch(url, session, mapped)
except:
@inlineCallbacks return (mapped, None)
def callback_api_call(self, response, result):
result["status"] = response.code async def api_call(self, methods={}):
try:
text = yield response.content()
except: # noqa
self.log.error("Error with API call")
return False
#print("RESP TEXT", text)
try:
result["response"] = json.loads(text)
except json.decoder.JSONDecodeError:
result["success"] = "ERROR"
result["message"] = "Error parsing JSON."
return result
#print("RESP AFTER JSON", result)
result["status"] = response.code
if response.code == 200:
result["success"] = True
result["message"] = "OK"
else:
result["message"] = "API ERROR"
return result
def api_call(self, method: str):
headers = { headers = {
"User-Agent": ( "User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
) )
} }
url = f"{self.api_endpoint}/{method}" tasks = []
self.log.debug(f"GET {url}") sem = asyncio.Semaphore(100)
response = treq.get(url, headers=headers) connector = aiohttp.TCPConnector(limit=None)
result: Dict[str, Any] = { async with aiohttp.ClientSession(connector=connector) as session:
"success": False, for mapped, method in methods.items():
"message": "Call not successful", url = f"{self.api_endpoint}/{method}"
"response": None, self.log.debug(f"GET {url}")
"status": None, task = asyncio.create_task(self.bound_fetch(sem, url, session, mapped))
} #task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
response.addCallback(self.callback_api_call, result) tasks.append(task)
response.addErrback(self.dump, url=url) responses = await asyncio.gather(*tasks)
return response return responses

@ -3,7 +3,7 @@ import logging
log = logging.getLogger("util") log = logging.getLogger("util")
debug = False debug = True
# Color definitions # Color definitions
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)

Loading…
Cancel
Save