Implement running Discord and 4chan gathering simultaneously

This commit is contained in:
2022-09-02 22:30:45 +01:00
commit 734a2b7879
12 changed files with 532 additions and 0 deletions

216
sources/ch4.py Normal file
View File

@@ -0,0 +1,216 @@
# Python modules can't start with a number...
import json
import random
import string
from datetime import datetime
from typing import Any, Dict
import treq
from bs4 import BeautifulSoup
from siphashc import siphash
from twisted.internet.defer import inlineCallbacks
import db
import util
class Chan4(object):
    """
    4chan indexer, crawler and ingester.

    A crawl (see ``run``) fetches the board list, then the catalog of every
    board, then every thread listed in those catalogs. Each post is
    fingerprinted with SipHash and the fingerprint is cached through
    ``db.r``, so only new or changed posts reach ``db.store_message``.
    """

    # Translates 4chan API post field names into the names that are stored.
    NAME_MAP = {
        "no": "msg_id",
        "now": "ts",
        "name": "user",
        "trip": "nick",
        "id": "nick_id",
        "resto": "id_reply",
        "com": "msg",
        "ext": "file_ext",
        "w": "file_w",
        "h": "file_h",
        "tn_w": "file_tn_w",
        "tn_h": "file_tn_h",
        "tim": "file_tim",
        "fsize": "file_size",
        "md5": "file_md5",
        "filedeleted": "file_deleted",
        "spoiler": "file_spoiler",
        "custom_spoiler": "file_custom_spoiler",
        "m_img": "file_m_img",
        "time": "unix_time",
    }

    def __init__(self):
        """Set up logging, crawl state and the persistent hashing key."""
        name = self.__class__.__name__
        self.log = util.get_logger(name)

        self.api_endpoint = "https://a.4cdn.org"
        self.boards = []  # board names, filled in by got_board_list
        self.thread_list = {}  # board name -> catalog response
        self.thread_deferreds = []  # currently unused within this class

        self.log.info(f"Starting crawler bot to {self.api_endpoint}")

        # The hashing key is persisted so that fingerprints cached by an
        # earlier run remain comparable with the ones computed by this run.
        stored_key = db.r.get("hashing_key")
        if stored_key:
            self.hash_key = stored_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {self.hash_key}")
        else:
            letters = string.ascii_lowercase
            self.hash_key = "".join(random.choice(letters) for _ in range(16))
            self.log.debug(f"Created new hash key: {self.hash_key}")
            db.r.set("hashing_key", self.hash_key)

    @inlineCallbacks
    def run(self):
        """Run one full crawl: boards, then catalogs, then thread contents."""
        yield self.get_board_list()
        yield self.get_thread_lists()
        yield self.get_thread_contents()

    def get_board_list(self):
        """Request ``boards.json``; returns the Deferred of the API call."""
        self.log.info("Getting board list")
        response = self.api_call("boards.json")
        response.addCallback(self.got_board_list)
        return response

    def got_board_list(self, board_list):
        """Append every board name from a successful result to ``self.boards``."""
        if not board_list["success"]:
            # Previously a failed request was dropped without any trace.
            self.log.error(f"Error fetching board list: {board_list['message']}")
            return
        for board in board_list["response"]["boards"]:
            self.boards.append(board["board"])

    @inlineCallbacks
    def get_thread_lists(self):
        """Fetch the catalog of every known board, one request at a time."""
        for board in self.boards:
            yield self.get_thread_list(board)

    @inlineCallbacks
    def get_thread_contents(self):
        """Fetch every thread of every stored catalog, one request at a time."""
        for board, pages in self.thread_list.items():
            for page in pages:
                for thread in page["threads"]:
                    yield self.get_thread_content(board, thread["no"])

    def get_thread_list(self, board):
        """Request the catalog of ``board``; returns the Deferred of the call."""
        self.log.info(f"Getting thread list for {board}")
        response = self.api_call(f"{board}/catalog.json")
        response.addCallback(self.got_thread_list, board)
        return response

    def got_thread_list(self, thread_list, board):
        """Store a successfully fetched catalog under ``self.thread_list[board]``."""
        if not thread_list["success"]:
            self.log.error(
                f"Error fetching thread list for {board}: {thread_list['message']}"
            )
            return
        pages = thread_list["response"]
        self.thread_list[board] = pages
        # Report the size of the catalog itself; len(thread_list) would only
        # count the keys of the result envelope.
        self.log.info(f"Got thread list for {board}: {len(pages)}")

    def get_thread_content(self, board, thread):
        """Request one thread of ``board``; returns the Deferred of the call."""
        self.log.info(f"Getting information for thread {thread} on board {board}")
        response = self.api_call(f"{board}/thread/{thread}.json")
        response.addCallback(self.got_thread_content, board, thread)
        return response

    def got_thread_content(self, thread_content, board, thread):
        """Pass every post of a fetched thread to ``handle_post``, or log the failure."""
        if thread_content["success"]:
            self.log.info(f"Got thread content for thread {thread} on board {board}")
            for post in thread_content["response"]["posts"]:
                self.handle_post(board, thread, post)
        else:
            self.log.error(
                (
                    f"Error fetching thread {thread} on board {board}: "
                    f"{thread_content['message']}"
                )
            )

    def handle_post(self, board, thread, post):
        """
        Normalise one post and store it unless it is unchanged.

        ``post`` is modified in place: it is tagged with ``type`` and ``src``,
        its API field names are renamed through ``NAME_MAP``, the timestamp is
        converted to ISO 8601 and HTML is stripped from the message text.

        The post is skipped when its fingerprint equals the cached one. It is
        stored with type "update" when a different fingerprint was cached,
        and with type "msg" when nothing was cached.
        """
        post["type"] = "msg"

        # Fingerprint the raw post; sort_keys keeps the serialisation stable.
        post_normalised = json.dumps(post, sort_keys=True)
        post_hash = str(siphash(self.hash_key, post_normalised))

        redis_key = f"cache.{board}.{thread}.{post['no']}"
        cached_hash = db.r.get(redis_key)
        if cached_hash:
            if cached_hash.decode("ascii") == post_hash:
                return  # nothing changed since this post was last stored
            post["type"] = "update"
        db.r.set(redis_key, post_hash)

        # Iterate over a snapshot of the keys because the dict is modified.
        for key in list(post):
            if key in self.NAME_MAP:
                post[self.NAME_MAP[key]] = post.pop(key)

        if "ts" in post:
            old_time = post["ts"]
            # '08/30/22(Tue)02:25:37' -- the seconds part may be absent.
            # NOTE(review): the 4chan API documents this field as US Eastern
            # time; the value stored here is naive -- confirm consumers
            # do not treat it as UTC.
            if old_time.count(":") == 2:
                ts_format = "%m/%d/%y(%a)%H:%M:%S"
            else:
                ts_format = "%m/%d/%y(%a)%H:%M"
            post["ts"] = datetime.strptime(old_time, ts_format).isoformat()

        if "msg" in post:
            soup = BeautifulSoup(post["msg"], "html.parser")
            post["msg"] = soup.get_text(separator="\n")

        post["src"] = "4ch"
        db.store_message(post)

    @inlineCallbacks
    def callback_api_call(self, response, result):
        """
        Read and decode an HTTP response into the ``result`` envelope.

        Always returns ``result``. ``success`` is True only for a 200
        response whose body parsed as JSON; in every other case it stays
        False and ``message`` describes the failure, so callers can rely on
        a plain truthiness check of ``result["success"]``.
        """
        result["status"] = response.code
        try:
            text = yield response.content()
        except Exception:
            # Still hand back the envelope: returning None here made every
            # downstream callback fail on ``None["success"]``.
            self.log.error("Error with API call")
            result["message"] = "Error reading response body."
            return result

        try:
            result["response"] = json.loads(text)
        except ValueError:
            # Covers JSONDecodeError and undecodable bytes. The flag must be
            # falsy: the former "ERROR" string passed the callers' success
            # checks and they then indexed into a None response.
            result["success"] = False
            result["message"] = "Error parsing JSON."
            return result

        if response.code == 200:
            result["success"] = True
            result["message"] = "OK"
        else:
            result["message"] = "API ERROR"
        return result

    def api_call(self, method: str):
        """
        Issue a GET request for ``method`` relative to the API endpoint.

        Returns a Deferred that fires with a dict holding the keys
        ``success``, ``message``, ``response`` and ``status`` (see
        ``callback_api_call``).
        """
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
            )
        }
        url = f"{self.api_endpoint}/{method}"
        self.log.debug(f"GET {url}")
        response = treq.get(url, headers=headers)
        result: Dict[str, Any] = {
            "success": False,
            "message": "Invalid Method",
            "response": None,
            "status": None,
        }
        response.addCallback(self.callback_api_call, result)
        return response

62
sources/dis.py Normal file
View File

@@ -0,0 +1,62 @@
#!/usr/bin/env python
from operator import attrgetter
import discord
import db
import util
# Output field name -> dotted attribute path looked up on the message object.
# Paths that cannot be resolved on a given message are left out of the result
# (see DiscordClient.recurse_dict).
ATTRMAP = {
    # Message
    "msg": "content",
    "msg_id": "id",
    # Author
    "nick": "author.name",
    "host": "author.discriminator",
    "ident": "author.nick",
    # Timestamp; renamed to "ts" before the message is stored
    "time": "created_at",
    # Channel
    "channel": "channel.name",
    "channel_nsfw": "channel.nsfw",
    # Identifiers and flags
    "bot": "author.bot",
    "user_id": "author.id",
    "channel_id": "channel.id",
    # Guild, reached through the author
    "net": "author.guild.name",
    "net_id": "author.guild.id",
    "guild_member_count": "author.guild.member_count",
    # Channel category
    "channel_category": "channel.category.name",
    "channel_category_id": "channel.category.id",
    "channel_category_nsfw": "channel.category.nsfw",
}
class DiscordClient(discord.Client):
    """Discord client that flattens incoming messages and stores them."""

    def __init__(self, *args, **kwargs):
        """Prepare the logger and bookkeeping flags, then initialise the client."""
        self.logger = None
        self.did_something = False
        self.log = util.get_logger(self.__class__.__name__)
        super().__init__(*args, **kwargs)

    async def on_ready(self):
        """Log that the connection is up."""
        self.log.info("Discord connection established.")

    def recurse_dict(self, obj):
        """
        Build a flat dict from ``obj`` using the paths in ``ATTRMAP``.

        Fields whose attribute path raises AttributeError are omitted.
        """
        extracted = {}
        for field, dotted_path in ATTRMAP.items():
            resolve = attrgetter(dotted_path)
            try:
                value = resolve(obj)
            except AttributeError:
                # Some part of the path does not exist on this object.
                continue
            extracted[field] = value
        return extracted

    async def on_message(self, message):
        """Store a message that has content; messages without content are ignored."""
        if not message.content:
            return

        record = self.recurse_dict(message)
        # Replace the raw "time" value by its ISO 8601 string under "ts".
        record["ts"] = record.pop("time").isoformat()
        record.update(type="msg", src="dis")
        db.store_message(record)