# Python modules can't start with a number...
import asyncio
import json
import random
import string
from datetime import datetime
from typing import Any, Dict

import aiohttp
import treq
from bs4 import BeautifulSoup
from siphashc import siphash

import db
import util
from schemas.ch4_s import ATTRMAP


class Chan4(object):
    """
    4chan indexer, crawler and ingester.

    Repeatedly crawls the catalogs of the boards in ``self.boards``, fetches
    every thread, normalises the posts to the internal schema and bulk-stores
    them via ``db.store_message_bulk``. Redis (``db.r``) is used both for the
    persistent SipHash key and as a per-post dedup cache.
    """

    def __init__(self):
        name = self.__class__.__name__
        self.log = util.get_logger(name)
        self.api_endpoint = "https://a.4cdn.org"
        # Fixed board set; board discovery via boards.json is disabled below.
        self.boards = ["out"]
        self.thread_list = {}
        self.log.info(f"Starting crawler bot to {self.api_endpoint}")

        # Reuse a persistent SipHash key so post hashes remain comparable
        # across restarts; generate and store one on first run.
        self.hash_key = db.r.get("hashing_key")
        if not self.hash_key:
            letters = string.ascii_lowercase
            self.hash_key = "".join(random.choice(letters) for i in range(16))
            self.log.debug(f"Created new hash key: {self.hash_key}")
            db.r.set("hashing_key", self.hash_key)
        else:
            self.hash_key = self.hash_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {self.hash_key}")

    async def run(self):
        """Entry point: start the crawl loop."""
        await self.get_board_list()

    async def get_board_list(self):
        """Kick off crawling for the configured boards.

        Dynamic board discovery (boards.json) is intentionally disabled;
        only the fixed set in ``self.boards`` is crawled.
        """
        await self.get_thread_lists(self.boards)

    async def get_thread_lists(self, boards):
        """Continuously fetch each board's catalog and ingest its threads.

        BUG FIX: the original version recursed into itself after every pass
        (``await self.get_thread_lists(self.boards)``), growing the coroutine
        await chain without bound. A loop has identical observable behavior
        and constant stack depth.
        """
        while True:
            self.log.debug(f"Getting thread list for {boards}")
            board_urls = {board: f"{board}/catalog.json" for board in boards}
            responses = await self.api_call(board_urls)
            to_get = []
            for mapped, response in responses:
                if not response:
                    continue
                # A catalog response is a list of pages, each holding threads.
                for page in response:
                    for threads in page["threads"]:
                        no = threads["no"]
                        to_get.append((mapped, no))
                self.log.info(f"Got thread list for {mapped}: {len(response)}")
            await self.get_threads_content(to_get)

    async def get_threads_content(self, thread_list):
        """Fetch every (board, thread) pair and hand its posts to the ingester.

        :param thread_list: iterable of ``(board, thread_no)`` tuples.
        """
        thread_urls = {
            (board, thread): f"{board}/thread/{thread}.json"
            for board, thread in thread_list
        }
        self.log.debug(f"Getting information for threads: {thread_urls}")
        responses = await self.api_call(thread_urls)
        self.log.debug(f"Got information for threads: {thread_urls}")
        for mapped, response in responses:
            if not response:
                continue
            board, thread = mapped
            self.log.debug(f"Got thread content for thread {thread} on board {board}")
            await self.handle_posts(board, thread, response["posts"])

    async def handle_posts(self, board, thread, posts):
        """Normalise raw 4chan posts and bulk-store the new/updated ones.

        Per post: dedup via a SipHash of the canonical JSON stored in Redis,
        rename fields through ATTRMAP, convert the timestamp to ISO-8601 and
        strip HTML from the message body.
        """
        to_store = []
        for index, post in enumerate(posts):
            posts[index]["type"] = "msg"

            # Hash the canonicalised post so unchanged posts can be skipped
            # and edited posts flagged as updates.
            post_normalised = json.dumps(post, sort_keys=True)
            # Renamed from `hash`, which shadowed the builtin.
            post_hash = str(siphash(self.hash_key, post_normalised))
            redis_key = f"cache.{board}.{thread}.{post['no']}"
            key_content = db.r.get(redis_key)
            if key_content:
                key_content = key_content.decode("ascii")
                if key_content == post_hash:
                    # BUG FIX: the original `return`ed here, silently dropping
                    # every remaining post in the thread. Skip only this one.
                    continue
                posts[index]["type"] = "update"
            db.r.set(redis_key, post_hash)

            # Map 4chan field names onto the internal schema.
            for key, value in list(post.items()):
                if key in ATTRMAP:
                    post[ATTRMAP[key]] = posts[index][key]
                    del posts[index][key]

            # Convert e.g. '08/30/22(Tue)02:25:37' (seconds optional) to ISO-8601.
            if "ts" in post:
                old_time = posts[index]["ts"]
                time_spl = old_time.split(":")
                if len(time_spl) == 3:
                    old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
                else:
                    old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
                posts[index]["ts"] = old_ts.isoformat()

            # Strip HTML markup, keeping line structure.
            if "msg" in post:
                soup = BeautifulSoup(posts[index]["msg"], "html.parser")
                posts[index]["msg"] = soup.get_text(separator="\n")

            posts[index]["src"] = "4ch"
            to_store.append(posts[index])

        if to_store:
            await db.store_message_bulk(to_store)

    async def fetch(self, url, session, mapped):
        """GET ``url`` and return ``(mapped, parsed_json)``."""
        async with session.get(url) as response:
            return (mapped, await response.json())

    async def bound_fetch(self, sem, url, session, mapped):
        """Fetch ``url`` under the concurrency semaphore.

        Returns ``(mapped, None)`` on failure so one bad request cannot
        abort the whole ``asyncio.gather``.
        """
        async with sem:
            try:
                return await self.fetch(url, session, mapped)
            except Exception as e:
                # BUG FIX: was a bare `except:`, which also swallowed
                # CancelledError/KeyboardInterrupt. Log instead of hiding.
                self.log.error(f"Failed to fetch {url}: {e}")
                return (mapped, None)

    async def api_call(self, methods=None):
        """Fan out GETs for every API method, bounded to 100 in flight.

        :param methods: mapping of caller tag -> API path (e.g.
            ``{board: "board/catalog.json"}``). Defaults to no requests.
        :returns: list of ``(tag, parsed_json_or_None)`` tuples.
        """
        # BUG FIX: mutable default argument `methods={}` replaced with the
        # None-sentinel idiom (behaviorally identical for all callers).
        if methods is None:
            methods = {}
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
            )
        }
        tasks = []
        sem = asyncio.Semaphore(100)
        connector = aiohttp.TCPConnector(limit=None)
        # BUG FIX: `headers` was built but never used; pass it to the session
        # so the User-Agent is actually sent.
        async with aiohttp.ClientSession(
            connector=connector, headers=headers
        ) as session:
            for mapped, method in methods.items():
                url = f"{self.api_endpoint}/{method}"
                self.log.debug(f"GET {url}")
                task = asyncio.create_task(
                    self.bound_fetch(sem, url, session, mapped)
                )
                tasks.append(task)
            responses = await asyncio.gather(*tasks)
        return responses