# Python modules can't start with a number... import ujson import random import string from datetime import datetime from bs4 import BeautifulSoup from siphashc import siphash import db import util from schemas.ch4_s import ATTRMAP import aiohttp import asyncio from numpy import array_split from math import ceil from concurrent.futures import ProcessPoolExecutor p = ProcessPoolExecutor(10) class Chan4(object): """ 4chan indexer, crawler and ingester. """ def __init__(self): name = self.__class__.__name__ self.log = util.get_logger(name) self.api_endpoint = "https://a.4cdn.org" #self.boards = ["out", "g", "a", "3", "pol"] # self.boards = [] self.thread_list = {} #self.thread_deferreds = [] #self.content_deferreds = [] self.log.info(f"Starting crawler bot to {self.api_endpoint}") self.hash_key = db.r.get("hashing_key") if not self.hash_key: letters = string.ascii_lowercase self.hash_key = "".join(random.choice(letters) for i in range(16)) self.log.debug(f"Created new hash key: {self.hash_key}") db.r.set("hashing_key", self.hash_key) else: self.hash_key = self.hash_key.decode("ascii") self.log.debug(f"Decoded hash key: {self.hash_key}") async def run(self): await self.get_board_list() async def get_board_list(self): responses = await self.api_call({"_": "boards.json"}) for mapped, response in responses: if not response: continue for board in response["boards"]: self.boards.append(board["board"]) self.log.debug(f"Got boards: {self.boards}") await self.get_thread_lists(self.boards) async def get_thread_lists(self, boards): self.log.debug(f"Getting thread list for {boards}") board_urls = {board: f"{board}/catalog.json" for board in boards} responses = await self.api_call(board_urls) to_get = [] for mapped, response in responses: if not response: continue for page in response: for threads in page["threads"]: no = threads["no"] to_get.append((mapped, no)) self.log.info(f"Got thread list for {mapped}: {len(response)}") await self.get_threads_content(to_get) # Recurse await self.get_thread_lists(self.boards) async def get_threads_content(self, thread_list): thread_urls = {(board, thread): f"{board}/thread/{thread}.json" for board, thread in thread_list} self.log.debug(f"Getting information for threads: {thread_urls}") responses = await self.api_call(thread_urls) self.log.debug(f"Got information for threads: {thread_urls}") all_posts = {} for mapped, response in responses: if not response: continue board, thread = mapped self.log.debug(f"Got thread content for thread {thread} on board {board}") all_posts[mapped] = response["posts"] # Split into 10,000 chunks # split_posts = array_split(all_posts, ceil(len(all_posts) / 10000)) # print("SPLIT CHUNK", len(split_posts)) # for posts in split_posts: # with futures.ThreadPoolExecutor(max_workers=6) as executor: # print("SUBMITTED THREAD FOR", len(posts)) # executor.submit(self.handle_posts, board, thread, posts) #await self.handle_posts(board, thread, response["posts"]) #await asyncio.sleep(1) await self.handle_posts_thread(all_posts) @asyncio.coroutine def handle_posts_thread(self, posts): loop = asyncio.get_event_loop() print("HANDLE POSTS THREAD", len(posts.keys())) yield from loop.run_in_executor(p, self.handle_posts, posts) def handle_posts(self, posts): print("HANDLE POSTS START") to_store = [] for key, post_list in posts.items(): board, thread = key print("PROCESSING BOARD", board, "THREAD", thread) print("POSTS HERE", len(post_list)) for index, post in enumerate(post_list): posts[key][index]["type"] = "msg" # Calculate hash for post post_normalised = ujson.dumps(post, sort_keys=True) hash = siphash(self.hash_key, post_normalised) hash = str(hash) redis_key = f"cache.{board}.{thread}.{post['no']}" key_content = db.r.get(redis_key) if key_content: key_content = key_content.decode("ascii") if key_content == hash: continue else: posts[key][index]["type"] = "update" #db.r.set(redis_key, hash) for key2, value in list(post.items()): if key2 in ATTRMAP: post[ATTRMAP[key2]] = posts[key][index][key2] del posts[key][index][key2] if "ts" in post: old_time = posts[key][index]["ts"] # '08/30/22(Tue)02:25:37' time_spl = old_time.split(":") if len(time_spl) == 3: old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S") else: old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M") new_ts = old_ts.isoformat() posts[key][index]["ts"] = new_ts if "msg" in post: soup = BeautifulSoup(posts[key][index]["msg"], "html.parser") msg = soup.get_text(separator="\n") posts[key][index]["msg"] = msg posts[key][index]["src"] = "4ch" to_store.append(posts[key][index]) # print({name_map[name]: val for name, val in post.items()}) #print(f"Got posts: {len(posts)}") print("HANDLE POSTS DONE") db.store_message_bulk(to_store) print("STORE DB DONE") async def fetch(self, url, session, mapped): async with session.get(url) as response: try: return (mapped, await response.json()) except: print("FETCH ERROR") return (mapped, None) async def bound_fetch(self, sem, url, session, mapped): # Getter function with semaphore. async with sem: try: return await self.fetch(url, session, mapped) except: print("BOUND ERROR") return (mapped, None) async def api_call(self, methods={}): headers = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" ) } tasks = [] sem = asyncio.Semaphore(100) connector = aiohttp.TCPConnector(limit=None) async with aiohttp.ClientSession(connector=connector) as session: for mapped, method in methods.items(): url = f"{self.api_endpoint}/{method}" self.log.debug(f"GET {url}") task = asyncio.create_task(self.bound_fetch(sem, url, session, mapped)) #task = asyncio.ensure_future(self.bound_fetch(sem, url, session)) tasks.append(task) responses = await asyncio.gather(*tasks) return responses