You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

217 lines
7.1 KiB
Python

# Python modules can't start with a number...
import json
import random
import string
from datetime import datetime
from typing import Any, Dict
import treq
from bs4 import BeautifulSoup
from siphashc import siphash
from twisted.internet.defer import inlineCallbacks
import db
import util
class Chan4(object):
"""
4chan indexer, crawler and ingester.
"""
def __init__(self):
name = self.__class__.__name__
self.log = util.get_logger(name)
self.api_endpoint = "https://a.4cdn.org"
self.boards = []
self.thread_list = {}
self.thread_deferreds = []
self.log.info(f"Starting crawler bot to {self.api_endpoint}")
self.hash_key = db.r.get("hashing_key")
if not self.hash_key:
letters = string.ascii_lowercase
self.hash_key = "".join(random.choice(letters) for i in range(16))
self.log.debug(f"Created new hash key: {self.hash_key}")
db.r.set("hashing_key", self.hash_key)
else:
self.hash_key = self.hash_key.decode("ascii")
self.log.debug(f"Decoded hash key: {self.hash_key}")
@inlineCallbacks
def run(self):
yield self.get_board_list()
yield self.get_thread_lists()
yield self.get_thread_contents()
def get_board_list(self):
self.log.info("Getting board list")
response = self.api_call("boards.json")
response.addCallback(self.got_board_list)
return response
def got_board_list(self, board_list):
if board_list["success"]:
for board in board_list["response"]["boards"]:
self.boards.append(board["board"])
@inlineCallbacks
def get_thread_lists(self):
for board in self.boards:
yield self.get_thread_list(board)
# self.thread_deferreds.append(d)
# yield defer.gatherResults(self.thread_deferreds)
# self.thread_deferreds = []
# self.log.info("Finished getting thread lists")
@inlineCallbacks
def get_thread_contents(self):
for board in self.thread_list.keys():
for page in self.thread_list[board]:
for threads in page["threads"]:
no = threads["no"]
yield self.get_thread_content(board, no)
# self.content_deferreds.append(d)
# al = yield defer.gatherResults(self.content_deferreds)
# self.content_deferreds = []
# self.log.info("Finished getting content")
def get_thread_list(self, board):
self.log.info(f"Getting thread list for {board}")
response = self.api_call(f"{board}/catalog.json")
response.addCallback(self.got_thread_list, board)
return response
def got_thread_list(self, thread_list, board):
if thread_list["success"]:
self.thread_list[board] = thread_list["response"]
self.log.info(f"Got thread list for {board}: {len(thread_list)}")
def get_thread_content(self, board, thread):
self.log.info(f"Getting information for thread {thread} on board {board}")
response = self.api_call(f"{board}/thread/{thread}.json")
response.addCallback(self.got_thread_content, board, thread)
return response
def got_thread_content(self, thread_content, board, thread):
if thread_content["success"]:
self.log.info(f"Got thread content for thread {thread} on board {board}")
for post in thread_content["response"]["posts"]:
# print(post)
self.handle_post(board, thread, post)
else:
self.log.error(
(
f"Error fetching thread {thread} on board {board}: "
f"{thread_content['message']}"
)
)
def handle_post(self, board, thread, post):
name_map = {
"no": "msg_id",
"now": "ts",
"name": "user",
"trip": "nick",
"id": "nick_id",
"resto": "id_reply",
"com": "msg",
"ext": "file_ext",
"w": "file_w",
"h": "file_h",
"tn_w": "file_tn_w",
"tn_h": "file_tn_h",
"tim": "file_tim",
"fsize": "file_size",
"md5": "file_md5",
"filedeleted": "file_deleted",
"spoiler": "file_spoiler",
"custom_spoiler": "file_custom_spoiler",
"m_img": "file_m_img",
"time": "unix_time",
}
post["type"] = "msg"
# Calculate hash for post
post_normalised = json.dumps(post, sort_keys=True)
hash = siphash(self.hash_key, post_normalised)
hash = str(hash)
redis_key = f"cache.{board}.{thread}.{post['no']}"
key_content = db.r.get(redis_key)
if key_content:
key_content = key_content.decode("ascii")
if key_content == hash:
return
else:
post["type"] = "update"
db.r.set(redis_key, hash)
# Check if hash exists
# Store the hash
for key, value in list(post.items()):
if key in name_map:
post[name_map[key]] = post[key]
del post[key]
if "ts" in post:
old_time = post["ts"]
# '08/30/22(Tue)02:25:37'
time_spl = old_time.split(":")
if len(time_spl) == 3:
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
else:
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
new_ts = old_ts.isoformat()
post["ts"] = new_ts
if "msg" in post:
soup = BeautifulSoup(post["msg"], "html.parser")
msg = soup.get_text(separator="\n")
post["msg"] = msg
post["src"] = "4ch"
# print({name_map[name]: val for name, val in post.items()})
db.store_message(post)
@inlineCallbacks
def callback_api_call(self, response, result):
try:
text = yield response.content()
except: # noqa
self.log.error("Error with API call")
return
try:
result["response"] = json.loads(text)
except json.decoder.JSONDecodeError:
result["success"] = "ERROR"
result["message"] = "Error parsing JSON."
return result
result["status"] = response.code
if response.code == 200:
result["success"] = True
result["message"] = "OK"
else:
result["message"] = "API ERROR"
return result
def api_call(self, method: str):
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
)
}
url = f"{self.api_endpoint}/{method}"
self.log.debug(f"GET {url}")
response = treq.get(url, headers=headers)
result: Dict[str, Any] = {
"success": False,
"message": "Invalid Method",
"response": None,
"status": None,
}
response.addCallback(self.callback_api_call, result)
return response