|
|
|
@ -8,11 +8,10 @@ from typing import Any, Dict
|
|
|
|
|
import treq
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from siphashc import siphash
|
|
|
|
|
from twisted.internet.defer import inlineCallbacks
|
|
|
|
|
|
|
|
|
|
import db
|
|
|
|
|
import util
|
|
|
|
|
|
|
|
|
|
from schemas.ch4_s import ATTRMAP
|
|
|
|
|
|
|
|
|
|
class Chan4(object):
|
|
|
|
|
"""
|
|
|
|
@ -24,10 +23,11 @@ class Chan4(object):
|
|
|
|
|
self.log = util.get_logger(name)
|
|
|
|
|
|
|
|
|
|
self.api_endpoint = "https://a.4cdn.org"
|
|
|
|
|
self.boards = []
|
|
|
|
|
self.boards = ["out"]
|
|
|
|
|
self.thread_list = {}
|
|
|
|
|
|
|
|
|
|
self.thread_deferreds = []
|
|
|
|
|
#self.thread_deferreds = []
|
|
|
|
|
#self.content_deferreds = []
|
|
|
|
|
|
|
|
|
|
self.log.info(f"Starting crawler bot to {self.api_endpoint}")
|
|
|
|
|
|
|
|
|
@ -45,61 +45,74 @@ class Chan4(object):
|
|
|
|
|
@inlineCallbacks
|
|
|
|
|
def run(self):
|
|
|
|
|
yield self.get_board_list()
|
|
|
|
|
yield self.get_thread_lists()
|
|
|
|
|
yield self.get_thread_contents()
|
|
|
|
|
|
|
|
|
|
def got_thread_lists(self, thread_lists):
|
|
|
|
|
print("GOT THREAD LIST", thread_lists)
|
|
|
|
|
# Instead of while True, do it again!
|
|
|
|
|
d = self.get_thread_lists()
|
|
|
|
|
d.addCallback(self.got_thread_lists)
|
|
|
|
|
# @inlineCallbacks
|
|
|
|
|
# def mainloop(self):
|
|
|
|
|
# while True:
|
|
|
|
|
# yield self.get_thread_lists()
|
|
|
|
|
# yield self.get_thread_contents()
|
|
|
|
|
|
|
|
|
|
@inlineCallbacks
|
|
|
|
|
def get_board_list(self):
|
|
|
|
|
self.log.info("Getting board list")
|
|
|
|
|
self.log.debug("Getting board list")
|
|
|
|
|
response = self.api_call("boards.json")
|
|
|
|
|
response.addCallback(self.got_board_list)
|
|
|
|
|
return response
|
|
|
|
|
yield response
|
|
|
|
|
|
|
|
|
|
@inlineCallbacks
|
|
|
|
|
def got_board_list(self, board_list):
|
|
|
|
|
if board_list["success"]:
|
|
|
|
|
for board in board_list["response"]["boards"]:
|
|
|
|
|
self.boards.append(board["board"])
|
|
|
|
|
self.log.debug(f"Got boards: {self.boards}")
|
|
|
|
|
d = self.get_thread_lists()
|
|
|
|
|
d.addCallback(self.got_thread_lists)
|
|
|
|
|
yield d
|
|
|
|
|
|
|
|
|
|
@inlineCallbacks
|
|
|
|
|
def get_thread_lists(self):
|
|
|
|
|
thread_deferreds = []
|
|
|
|
|
for board in self.boards:
|
|
|
|
|
yield self.get_thread_list(board)
|
|
|
|
|
# self.thread_deferreds.append(d)
|
|
|
|
|
# yield defer.gatherResults(self.thread_deferreds)
|
|
|
|
|
# self.thread_deferreds = []
|
|
|
|
|
# self.log.info("Finished getting thread lists")
|
|
|
|
|
|
|
|
|
|
@inlineCallbacks
|
|
|
|
|
def get_thread_contents(self):
|
|
|
|
|
for board in self.thread_list.keys():
|
|
|
|
|
for page in self.thread_list[board]:
|
|
|
|
|
for threads in page["threads"]:
|
|
|
|
|
no = threads["no"]
|
|
|
|
|
yield self.get_thread_content(board, no)
|
|
|
|
|
# self.content_deferreds.append(d)
|
|
|
|
|
# al = yield defer.gatherResults(self.content_deferreds)
|
|
|
|
|
# self.content_deferreds = []
|
|
|
|
|
# self.log.info("Finished getting content")
|
|
|
|
|
d = self.get_thread_list(board)
|
|
|
|
|
d.addCallback(self.got_thread_list, board)
|
|
|
|
|
thread_deferreds.append(d)
|
|
|
|
|
|
|
|
|
|
yield defer.gatherResults(thread_deferreds)
|
|
|
|
|
|
|
|
|
|
def get_thread_list(self, board):
|
|
|
|
|
self.log.info(f"Getting thread list for {board}")
|
|
|
|
|
self.log.debug(f"Getting thread list for {board}")
|
|
|
|
|
response = self.api_call(f"{board}/catalog.json")
|
|
|
|
|
response.addCallback(self.got_thread_list, board)
|
|
|
|
|
return response
|
|
|
|
|
|
|
|
|
|
def got_thread_list(self, thread_list, board):
|
|
|
|
|
if not thread_list:
|
|
|
|
|
self.log.error(f"Thread list invalid: {thread_list} {board}")
|
|
|
|
|
return
|
|
|
|
|
if thread_list["success"]:
|
|
|
|
|
self.thread_list[board] = thread_list["response"]
|
|
|
|
|
#self.thread_list[board] = thread_list["response"]
|
|
|
|
|
for page in thread_list["response"]:
|
|
|
|
|
for threads in page["threads"]:
|
|
|
|
|
no = threads["no"]
|
|
|
|
|
d = self.get_thread_content(board, no)
|
|
|
|
|
d.addCallback(self.got_thread_content, board, no)
|
|
|
|
|
self.log.info(f"Got thread list for {board}: {len(thread_list)}")
|
|
|
|
|
|
|
|
|
|
def get_thread_content(self, board, thread):
|
|
|
|
|
self.log.info(f"Getting information for thread {thread} on board {board}")
|
|
|
|
|
self.log.debug(f"Getting information for thread {thread} on board {board}")
|
|
|
|
|
response = self.api_call(f"{board}/thread/{thread}.json")
|
|
|
|
|
response.addCallback(self.got_thread_content, board, thread)
|
|
|
|
|
return response
|
|
|
|
|
|
|
|
|
|
def got_thread_content(self, thread_content, board, thread):
|
|
|
|
|
if not thread_content:
|
|
|
|
|
self.log.error(f"Thread content invalid: {thread_content} {board} {thread}")
|
|
|
|
|
return
|
|
|
|
|
if thread_content["success"]:
|
|
|
|
|
self.log.info(f"Got thread content for thread {thread} on board {board}")
|
|
|
|
|
self.log.debug(f"Got thread content for thread {thread} on board {board}")
|
|
|
|
|
for post in thread_content["response"]["posts"]:
|
|
|
|
|
# print(post)
|
|
|
|
|
self.handle_post(board, thread, post)
|
|
|
|
@ -112,28 +125,6 @@ class Chan4(object):
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def handle_post(self, board, thread, post):
|
|
|
|
|
name_map = {
|
|
|
|
|
"no": "msg_id",
|
|
|
|
|
"now": "ts",
|
|
|
|
|
"name": "user",
|
|
|
|
|
"trip": "nick",
|
|
|
|
|
"id": "nick_id",
|
|
|
|
|
"resto": "id_reply",
|
|
|
|
|
"com": "msg",
|
|
|
|
|
"ext": "file_ext",
|
|
|
|
|
"w": "file_w",
|
|
|
|
|
"h": "file_h",
|
|
|
|
|
"tn_w": "file_tn_w",
|
|
|
|
|
"tn_h": "file_tn_h",
|
|
|
|
|
"tim": "file_tim",
|
|
|
|
|
"fsize": "file_size",
|
|
|
|
|
"md5": "file_md5",
|
|
|
|
|
"filedeleted": "file_deleted",
|
|
|
|
|
"spoiler": "file_spoiler",
|
|
|
|
|
"custom_spoiler": "file_custom_spoiler",
|
|
|
|
|
"m_img": "file_m_img",
|
|
|
|
|
"time": "unix_time",
|
|
|
|
|
}
|
|
|
|
|
post["type"] = "msg"
|
|
|
|
|
|
|
|
|
|
# Calculate hash for post
|
|
|
|
@ -152,8 +143,8 @@ class Chan4(object):
|
|
|
|
|
# Check if hash exists
|
|
|
|
|
# Store the hash
|
|
|
|
|
for key, value in list(post.items()):
|
|
|
|
|
if key in name_map:
|
|
|
|
|
post[name_map[key]] = post[key]
|
|
|
|
|
if key in ATTRMAP:
|
|
|
|
|
post[ATTRMAP[key]] = post[key]
|
|
|
|
|
del post[key]
|
|
|
|
|
if "ts" in post:
|
|
|
|
|
old_time = post["ts"]
|
|
|
|
@ -175,19 +166,25 @@ class Chan4(object):
|
|
|
|
|
# print({name_map[name]: val for name, val in post.items()})
|
|
|
|
|
db.store_message(post)
|
|
|
|
|
|
|
|
|
|
def dump(self, *args, **kwargs):
|
|
|
|
|
self.log.error(f"Error: {args} {kwargs}")
|
|
|
|
|
|
|
|
|
|
@inlineCallbacks
|
|
|
|
|
def callback_api_call(self, response, result):
|
|
|
|
|
result["status"] = response.code
|
|
|
|
|
try:
|
|
|
|
|
text = yield response.content()
|
|
|
|
|
except: # noqa
|
|
|
|
|
self.log.error("Error with API call")
|
|
|
|
|
return
|
|
|
|
|
return False
|
|
|
|
|
#print("RESP TEXT", text)
|
|
|
|
|
try:
|
|
|
|
|
result["response"] = json.loads(text)
|
|
|
|
|
except json.decoder.JSONDecodeError:
|
|
|
|
|
result["success"] = "ERROR"
|
|
|
|
|
result["message"] = "Error parsing JSON."
|
|
|
|
|
return result
|
|
|
|
|
#print("RESP AFTER JSON", result)
|
|
|
|
|
result["status"] = response.code
|
|
|
|
|
if response.code == 200:
|
|
|
|
|
result["success"] = True
|
|
|
|
@ -208,9 +205,10 @@ class Chan4(object):
|
|
|
|
|
response = treq.get(url, headers=headers)
|
|
|
|
|
result: Dict[str, Any] = {
|
|
|
|
|
"success": False,
|
|
|
|
|
"message": "Invalid Method",
|
|
|
|
|
"message": "Call not successful",
|
|
|
|
|
"response": None,
|
|
|
|
|
"status": None,
|
|
|
|
|
}
|
|
|
|
|
response.addCallback(self.callback_api_call, result)
|
|
|
|
|
response.addErrback(self.dump, url=url)
|
|
|
|
|
return response
|
|
|
|
|