# Python modules can't start with a number...
import asyncio
import random
import string
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
from math import ceil

import aiohttp
import ujson
from bs4 import BeautifulSoup
from numpy import array_split
from siphashc import siphash

import db
import util
from schemas.ch4_s import ATTRMAP

# CONFIGURATION #

# Number of 4chan threads to request at once
THREADS_CONCURRENT = 1000

# Seconds to wait between every THREADS_CONCURRENT requests
THREADS_DELAY = 0.1

# Seconds to wait between crawls
CRAWL_DELAY = 5

# Maximum number of simultaneous HTTP requests in flight (semaphore size)
THREADS_SEMAPHORE = 1000

# Maximum number of CPU threads to use for post processing
CPU_THREADS = 8

# CONFIGURATION END #

p = ProcessPoolExecutor(CPU_THREADS)


class Chan4(object):
    """
    4chan indexer, crawler and ingester.
    """

    def __init__(self):
        name = self.__class__.__name__
        self.log = util.get_logger(name)

        self.api_endpoint = "https://a.4cdn.org"
        # self.boards = ["out", "g", "a", "3", "pol"]
        self.boards = []

        # self.thread_deferreds = []
        # self.content_deferreds = []

        self.log.info(f"Starting crawler bot for {self.api_endpoint}")

        self.hash_key = db.r.get("hashing_key")
        if not self.hash_key:
            letters = string.ascii_lowercase
            self.hash_key = "".join(random.choice(letters) for i in range(16))
            self.log.debug(f"Created new hash key: {self.hash_key}")
            db.r.set("hashing_key", self.hash_key)
        else:
            self.hash_key = self.hash_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {self.hash_key}")

    async def run(self):
        await self.get_board_list()
        while True:
            await self.get_thread_lists(self.boards)
            await asyncio.sleep(CRAWL_DELAY)

    async def get_board_list(self):
        responses = await self.api_call({"_": "boards.json"})
        for mapped, response in responses:
            if not response:
                continue
            for board in response["boards"]:
                self.boards.append(board["board"])
        self.log.debug(f"Got boards: {self.boards}")

    async def get_thread_lists(self, boards):
        self.log.debug(f"Getting thread list for {boards}")
        board_urls = {board: f"{board}/catalog.json" for board in boards}
        responses = await self.api_call(board_urls)
        to_get = []
        for mapped, response in responses:
            if not response:
                continue
            for page in response:
                for threads in page["threads"]:
                    no = threads["no"]
                    to_get.append((mapped, no))
            self.log.debug(f"Got {len(response)} catalog pages for {mapped}")
        if not to_get:
            return
        # Fetch threads in batches of THREADS_CONCURRENT, pausing
        # THREADS_DELAY seconds between batches.
        split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
        for threads in split_threads:
            await self.get_threads_content(threads)
            await asyncio.sleep(THREADS_DELAY)
        # await self.get_threads_content(to_get)

    def take_items(self, dict_list, n):
        # Destructively yield up to n (key, item) pairs from a dict of
        # lists. The original raised and caught StopIteration inside the
        # generator to stop early; a plain return is equivalent and avoids
        # the PEP 479 pitfall.
        i = 0
        for x in list(dict_list.keys()):
            for item in list(dict_list[x]):
                yield (x, item)
                dict_list[x].remove(item)
                i += 1
                if i == n:
                    print("Take items took", i, "items")
                    return

    async def get_threads_content(self, thread_list):
        thread_urls = {
            (board, thread): f"{board}/thread/{thread}.json"
            for board, thread in thread_list
        }
        self.log.debug(f"Getting information for threads: {thread_urls}")
        responses = await self.api_call(thread_urls)
        self.log.debug(f"Got information for threads: {thread_urls}")

        all_posts = {}
        for mapped, response in responses:
            if not response:
                continue
            board, thread = mapped
            self.log.debug(f"Got thread content for thread {thread} on board {board}")
            all_posts[mapped] = response["posts"]

        if not all_posts:
            return
        # handle_posts is a coroutine, so it must be awaited to run.
        await self.handle_posts(all_posts)

        # Disabled fan-out: shard the posts across CPU_THREADS workers.
        # threads_per_core = int(len(all_posts) / CPU_THREADS)
        # for i in range(CPU_THREADS):
        #     new_dict = {}
        #     pulled_posts = self.take_items(all_posts, threads_per_core)
        #     for k, v in pulled_posts:
        #         if k in new_dict:
        #             new_dict[k].append(v)
        #         else:
        #             new_dict[k] = [v]
        #     await self.handle_posts_thread(new_dict)

        # print("VAL", ceil(len(all_posts) / threads_per_core))
        # split_posts = array_split(all_posts, ceil(len(all_posts) / threads_per_core))
        # print("THREADS PER CORE SPLIT", len(split_posts))
        # for posts in split_posts:
        #     print("SPAWNED THREAD TO PROCESS", len(posts), "POSTS")
        #     await self.handle_posts_thread(posts)
        # await self.handle_posts_thread(all_posts)
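    # Illustrative note (not in the original): take_items() is the building
    # block of the disabled fan-out above. It drains up to n (board, post)
    # pairs from the dict in place, so repeated calls shard the work:
    #
    #   work = {"g": [1, 2, 3], "a": [4]}
    #   batch = list(self.take_items(work, 2))  # [("g", 1), ("g", 2)]
    #   # work is now {"g": [3], "a": [4]}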
    async def handle_posts_thread(self, posts):
        # Replaces the old @asyncio.coroutine/yield from form, which was
        # removed in Python 3.11. Only referenced by the disabled fan-out
        # above; note handle_posts is itself a coroutine, so the pool worker
        # would still need an event loop (e.g. asyncio.run) to execute it.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(p, self.handle_posts, posts)

    async def handle_posts(self, posts):
        to_store = []
        for key, post_list in posts.items():
            board, thread = key
            for index, post in enumerate(post_list):
                posts[key][index]["type"] = "msg"

                # # Calculate hash for post
                # post_normalised = ujson.dumps(post, sort_keys=True)
                # hash = siphash(self.hash_key, post_normalised)
                # hash = str(hash)
                # redis_key = f"cache.{board}.{thread}.{post['no']}"
                # key_content = db.r.get(redis_key)
                # if key_content:
                #     key_content = key_content.decode("ascii")
                #     if key_content == hash:
                #         continue
                #     else:
                #         posts[key][index]["type"] = "update"
                # #db.r.set(redis_key, hash)

                # for key2, value in list(post.items()):
                #     if key2 in ATTRMAP:
                #         post[ATTRMAP[key2]] = posts[key][index][key2]
                #         del posts[key][index][key2]

                # if "ts" in post:
                #     old_time = posts[key][index]["ts"]
                #     # e.g. '08/30/22(Tue)02:25:37'
                #     time_spl = old_time.split(":")
                #     if len(time_spl) == 3:
                #         old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
                #     else:
                #         old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
                #     # new_ts = old_ts.isoformat()
                #     new_ts = int(old_ts.timestamp())
                #     posts[key][index]["ts"] = new_ts

                # if "msg" in post:
                #     soup = BeautifulSoup(posts[key][index]["msg"], "html.parser")
                #     msg = soup.get_text(separator="\n")
                #     posts[key][index]["msg"] = msg

                posts[key][index]["src"] = "4ch"
                posts[key][index]["net"] = board
                posts[key][index]["channel"] = thread

                to_store.append(posts[key][index])

        if to_store:
            self.log.info(f"Storing {len(to_store)} messages")
            await db.queue_message_bulk(to_store)

    async def fetch(self, url, session, mapped):
        async with session.get(url) as response:
            try:
                return (mapped, await response.json())
            except Exception:
                return (mapped, None)

    async def bound_fetch(self, sem, url, session, mapped):
        # Getter function with semaphore.
        async with sem:
            try:
                return await self.fetch(url, session, mapped)
            except Exception:
                return (mapped, None)

    async def api_call(self, methods=None):
        # None instead of a mutable {} default argument.
        methods = methods or {}
        tasks = []
        sem = asyncio.Semaphore(THREADS_SEMAPHORE)
        connector = aiohttp.TCPConnector(limit=None)
        async with aiohttp.ClientSession(connector=connector) as session:
            for mapped, method in methods.items():
                url = f"{self.api_endpoint}/{method}"
                self.log.debug(f"GET {url}")
                task = asyncio.create_task(self.bound_fetch(sem, url, session, mapped))
                tasks.append(task)
            responses = await asyncio.gather(*tasks)
        return responses
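

# Entry-point sketch: an assumption, not part of the original module. The
# crawler is driven entirely by Chan4.run(), so a minimal launcher just hands
# it to the event loop. It presumes the db and util modules imported above
# are importable and configured (e.g. Redis reachable via db.r).
if __name__ == "__main__":
    asyncio.run(Chan4().run())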