Reformat code

This commit is contained in:
2022-09-04 21:40:04 +01:00
parent 8feccbbf00
commit 20e22ae7ca
7 changed files with 50 additions and 130 deletions

View File

@@ -1,23 +1,22 @@
# Python modules can't start with a number...
import ujson
import asyncio
import random
import string
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
import aiohttp
import ujson
from bs4 import BeautifulSoup
from siphashc import siphash
import db
import util
from schemas.ch4_s import ATTRMAP
import aiohttp
import asyncio
from numpy import array_split
from math import ceil
from concurrent.futures import ProcessPoolExecutor
p = ProcessPoolExecutor(10)
class Chan4(object):
"""
4chan indexer, crawler and ingester.
@@ -28,12 +27,12 @@ class Chan4(object):
self.log = util.get_logger(name)
self.api_endpoint = "https://a.4cdn.org"
#self.boards = ["out", "g", "a", "3", "pol"] #
# self.boards = ["out", "g", "a", "3", "pol"] #
self.boards = []
self.thread_list = {}
#self.thread_deferreds = []
#self.content_deferreds = []
# self.thread_deferreds = []
# self.content_deferreds = []
self.log.info(f"Starting crawler bot to {self.api_endpoint}")
@@ -82,7 +81,10 @@ class Chan4(object):
await self.get_thread_lists(self.boards)
async def get_threads_content(self, thread_list):
thread_urls = {(board, thread): f"{board}/thread/{thread}.json" for board, thread in thread_list}
thread_urls = {
(board, thread): f"{board}/thread/{thread}.json"
for board, thread in thread_list
}
self.log.debug(f"Getting information for threads: {thread_urls}")
responses = await self.api_call(thread_urls)
self.log.debug(f"Got information for threads: {thread_urls}")
@@ -101,8 +103,8 @@ class Chan4(object):
# with futures.ThreadPoolExecutor(max_workers=6) as executor:
# print("SUBMITTED THREAD FOR", len(posts))
# executor.submit(self.handle_posts, board, thread, posts)
#await self.handle_posts(board, thread, response["posts"])
#await asyncio.sleep(1)
# await self.handle_posts(board, thread, response["posts"])
# await asyncio.sleep(1)
await self.handle_posts_thread(all_posts)
@asyncio.coroutine
@@ -158,7 +160,7 @@ class Chan4(object):
to_store.append(posts[key][index])
# print({name_map[name]: val for name, val in post.items()})
#print(f"Got posts: {len(posts)}")
# print(f"Got posts: {len(posts)}")
print("HANDLE POSTS DONE")
db.store_message_bulk(to_store)
print("STORE DB DONE")
@@ -167,26 +169,20 @@ class Chan4(object):
async with session.get(url) as response:
try:
return (mapped, await response.json())
except:
except: # noqa
print("FETCH ERROR")
return (mapped, None)
async def bound_fetch(self, sem, url, session, mapped):
    """Fetch ``url`` while holding ``sem``, never raising ordinary errors.

    The semaphore is held for the whole duration of the delegated
    ``self.fetch`` call, so the number of fetches in flight is bounded by
    the semaphore the caller passes in.

    :param sem: async context manager used as the concurrency limiter
    :param url: URL handed through to ``self.fetch``
    :param session: HTTP session handed through to ``self.fetch``
    :param mapped: opaque key returned alongside the result so the caller
        can match the response to its request
    :return: whatever ``self.fetch`` returns, or ``(mapped, None)`` if it
        raised an ``Exception``
    """
    async with sem:
        try:
            return await self.fetch(url, session, mapped)
        # Only swallow ordinary errors. A bare ``except:`` would also trap
        # asyncio.CancelledError / KeyboardInterrupt / SystemExit and turn
        # a task cancellation into a bogus "no data" result.
        except Exception:
            print("BOUND ERROR")
            return (mapped, None)
async def api_call(self, methods={}):
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
)
}
tasks = []
sem = asyncio.Semaphore(100)
connector = aiohttp.TCPConnector(limit=None)
@@ -195,8 +191,7 @@ class Chan4(object):
url = f"{self.api_endpoint}/{method}"
self.log.debug(f"GET {url}")
task = asyncio.create_task(self.bound_fetch(sem, url, session, mapped))
#task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
# task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
tasks.append(task)
responses = await asyncio.gather(*tasks)
return responses

View File

@@ -5,7 +5,6 @@ import discord
import db
import util
from schemas.dis_s import ATTRMAP