Improve memory usage and fix 4chan crawler
This commit is contained in:
@@ -74,26 +74,28 @@ class Chan4(object):
|
||||
|
||||
async def get_thread_lists(self, boards):
|
||||
# self.log.debug(f"Getting thread list for {boards}")
|
||||
board_urls = {board: f"{board}/catalog.json" for board in boards}
|
||||
board_urls = {board: f"{board}/threads.json" for board in boards}
|
||||
responses = await self.api_call(board_urls)
|
||||
to_get = []
|
||||
flat_map = [board for board, thread in responses]
|
||||
self.log.debug(f"Got thread list for {flat_map}: {len(responses)}")
|
||||
for mapped, response in responses:
|
||||
self.log.debug(f"Got thread list for {len(responses)} boards: {flat_map}")
|
||||
for board, response in responses:
|
||||
if not response:
|
||||
continue
|
||||
for page in response:
|
||||
for threads in page["threads"]:
|
||||
no = threads["no"]
|
||||
to_get.append((mapped, no))
|
||||
to_get.append((board, no))
|
||||
|
||||
if not to_get:
|
||||
return
|
||||
self.log.debug(f"Got {len(to_get)} threads to fetch")
|
||||
split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
|
||||
for threads in split_threads:
|
||||
await self.get_threads_content(threads)
|
||||
self.log.debug(f"Split threads into {len(split_threads)} series")
|
||||
for index, thr in enumerate(split_threads):
|
||||
self.log.debug(f"Series {index} - getting {len(thr)} threads")
|
||||
await self.get_threads_content(thr)
|
||||
await asyncio.sleep(THREADS_DELAY)
|
||||
# await self.get_threads_content(to_get)
|
||||
|
||||
def take_items(self, dict_list, n):
|
||||
i = 0
|
||||
@@ -132,14 +134,14 @@ class Chan4(object):
|
||||
to_store = []
|
||||
for key, post_list in posts.items():
|
||||
board, thread = key
|
||||
for index, post in enumerate(post_list):
|
||||
posts[key][index]["type"] = "msg"
|
||||
for post in post_list:
|
||||
post["type"] = "msg"
|
||||
|
||||
posts[key][index]["src"] = "4ch"
|
||||
posts[key][index]["net"] = board
|
||||
posts[key][index]["channel"] = thread
|
||||
post["src"] = "4ch"
|
||||
post["net"] = board
|
||||
post["channel"] = thread
|
||||
|
||||
to_store.append(posts[key][index])
|
||||
to_store.append(post)
|
||||
|
||||
if to_store:
|
||||
await db.queue_message_bulk(to_store)
|
||||
|
||||
Reference in New Issue
Block a user