Improve memory usage and fix 4chan crawler
@@ -74,26 +74,28 @@ class Chan4(object):
 
     async def get_thread_lists(self, boards):
-        # self.log.debug(f"Getting thread list for {boards}")
-        board_urls = {board: f"{board}/catalog.json" for board in boards}
+        board_urls = {board: f"{board}/threads.json" for board in boards}
         responses = await self.api_call(board_urls)
         to_get = []
         flat_map = [board for board, thread in responses]
-        self.log.debug(f"Got thread list for {flat_map}: {len(responses)}")
-        for mapped, response in responses:
+        self.log.debug(f"Got thread list for {len(responses)} boards: {flat_map}")
+        for board, response in responses:
             if not response:
                 continue
             for page in response:
                 for threads in page["threads"]:
                     no = threads["no"]
-                    to_get.append((mapped, no))
+                    to_get.append((board, no))
 
         if not to_get:
             return
         self.log.debug(f"Got {len(to_get)} threads to fetch")
         split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
-        for threads in split_threads:
-            await self.get_threads_content(threads)
+        self.log.debug(f"Split threads into {len(split_threads)} series")
+        for index, thr in enumerate(split_threads):
+            self.log.debug(f"Series {index} - getting {len(thr)} threads")
+            await self.get_threads_content(thr)
+            await asyncio.sleep(THREADS_DELAY)
         # await self.get_threads_content(to_get)
 
     def take_items(self, dict_list, n):
         i = 0
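Two things change in the hunk above: the crawler now hits threads.json instead of catalog.json, which returns only thread numbers per page rather than full catalog previews (presumably the memory win on the crawler side), and the fetch loop is throttled into logged series instead of firing every batch back-to-back. A minimal standalone sketch of that throttled-series pattern, assuming array_split comes from numpy and ceil from math as the call site suggests, with placeholder THREADS_CONCURRENT and THREADS_DELAY values standing in for the crawler's real constants:

import asyncio
from math import ceil

from numpy import array_split

THREADS_CONCURRENT = 100  # placeholder batch size, not the crawler's real constant
THREADS_DELAY = 0.1  # placeholder inter-series delay in seconds, also assumed


async def get_threads_content(batch):
    # Stand-in for Chan4.get_threads_content: fetch one series of (board, no) pairs.
    print(f"fetching {len(batch)} threads")


async def get_all(to_get):
    # ceil(n / THREADS_CONCURRENT) sections keeps every series at or under
    # THREADS_CONCURRENT items; the sleep spaces the series apart so the
    # whole worklist is no longer requested in one burst.
    split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
    for index, thr in enumerate(split_threads):
        print(f"Series {index} - getting {len(thr)} threads")
        await get_threads_content(thr)
        await asyncio.sleep(THREADS_DELAY)


asyncio.run(get_all([("g", no) for no in range(250)]))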
@@ -132,14 +134,14 @@ class Chan4(object):
         to_store = []
         for key, post_list in posts.items():
             board, thread = key
-            for index, post in enumerate(post_list):
-                posts[key][index]["type"] = "msg"
+            for post in post_list:
+                post["type"] = "msg"
 
-                posts[key][index]["src"] = "4ch"
-                posts[key][index]["net"] = board
-                posts[key][index]["channel"] = thread
+                post["src"] = "4ch"
+                post["net"] = board
+                post["channel"] = thread
 
-                to_store.append(posts[key][index])
+                to_store.append(post)
 
         if to_store:
             await db.queue_message_bulk(to_store)
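The simplification above relies on the fact that iterating a list yields references to the stored dicts, not copies: mutating post is equivalent to mutating posts[key][index], minus the enumerate bookkeeping. A tiny self-contained illustration (the board and thread values are invented):

posts = {("g", 12345): [{"com": "first post"}, {"com": "second post"}]}

to_store = []
for key, post_list in posts.items():
    board, thread = key
    for post in post_list:
        # post is a reference to the dict stored in posts[key], not a copy,
        # so these writes are visible through posts[key][index] as well.
        post["type"] = "msg"
        post["src"] = "4ch"
        post["net"] = board
        post["channel"] = thread
        to_store.append(post)

assert posts[("g", 12345)][0]["net"] == "g"  # mutated through the reference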
@@ -1,8 +1,6 @@
 import asyncio
-from os import getenv
-
 import orjson
 
 import db
 import util
 from processing import process
@@ -20,6 +18,7 @@ class Ingest(object):
     def __init__(self):
         name = self.__class__.__name__
         self.log = util.get_logger(name)
+        self.current_chunk = 0
         self.log.info(
             (
                 "Starting ingest handler for chunk size of "
@@ -30,20 +29,14 @@ class Ingest(object):
     async def run(self):
         while True:
             await self.get_chunk()
+            self.log.debug(f"Ingest chunk {self.current_chunk} complete")
+            self.current_chunk += 1
             await asyncio.sleep(ITER_DELAY)
 
     async def get_chunk(self):
-        items = []
-        # for source in SOURCES:
-        #     key = f"{KEYPREFIX}{source}"
         length = await db.ar.llen(KEYNAME)
-        start_num = length - CHUNK_SIZE
-        chunk = await db.ar.lrange(KEYNAME, start_num, -1)
-        # chunk = await db.ar.rpop(KEYNAME, CHUNK_SIZE)
-        if not chunk:
+        if length > CHUNK_SIZE:
+            length = CHUNK_SIZE
+        if not length:
             return
-        for item in chunk:
-            item = orjson.loads(item)
-            items.append(item)
-        if items:
-            await process.spawn_processing_threads(items)
+        await process.spawn_processing_threads(self.current_chunk, length)
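This last hunk is most of the memory improvement from the commit title: the old get_chunk pulled up to CHUNK_SIZE raw messages out of Redis with lrange and decoded them with orjson.loads inside the ingest loop, while the new one only asks Redis for the queue length and hands a chunk counter plus a capped length to the processing module, which pops and decodes the items itself. A minimal sketch of the new shape, assuming db.ar is an async Redis client such as redis.asyncio and that spawn_processing_threads(chunk, length) is the new signature shown above; the KEYNAME and CHUNK_SIZE values here are placeholders:

import asyncio

import redis.asyncio as redis  # assumed client behind db.ar (redis-py >= 4.2)

KEYNAME = "queue"  # placeholder list key
CHUNK_SIZE = 9000  # placeholder cap per iteration


async def spawn_processing_threads(chunk, length):
    # Stand-in for processing.process.spawn_processing_threads: the real one
    # presumably pops and decodes `length` items inside worker processes.
    print(f"chunk {chunk}: dispatching {length} items")


async def get_chunk(ar, current_chunk):
    # Only the queue length crosses this boundary; the messages themselves
    # stay in Redis until the processing side pops them.
    length = await ar.llen(KEYNAME)
    if length > CHUNK_SIZE:
        length = CHUNK_SIZE
    if not length:
        return
    await spawn_processing_threads(current_chunk, length)


async def main():
    ar = redis.Redis()  # needs a reachable Redis to actually run
    await get_chunk(ar, current_chunk=0)


asyncio.run(main())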