Improve memory usage and fix 4chan crawler

2022-10-21 07:20:30 +01:00
parent 2d7b6268dd
commit 51a9b2af79
6 changed files with 87 additions and 48 deletions


@@ -1,8 +1,6 @@
 import asyncio
 from os import getenv
-import orjson
 import db
 import util
 from processing import process
@@ -20,6 +18,7 @@ class Ingest(object):
     def __init__(self):
         name = self.__class__.__name__
         self.log = util.get_logger(name)
+        self.current_chunk = 0
         self.log.info(
             (
                 "Starting ingest handler for chunk size of "
@@ -30,20 +29,14 @@ class Ingest(object):
     async def run(self):
         while True:
             await self.get_chunk()
+            self.log.debug(f"Ingest chunk {self.current_chunk} complete")
+            self.current_chunk += 1
             await asyncio.sleep(ITER_DELAY)
 
     async def get_chunk(self):
-        items = []
-        # for source in SOURCES:
-        #     key = f"{KEYPREFIX}{source}"
         length = await db.ar.llen(KEYNAME)
-        start_num = length - CHUNK_SIZE
-        chunk = await db.ar.lrange(KEYNAME, start_num, -1)
-        # chunk = await db.ar.rpop(KEYNAME, CHUNK_SIZE)
-        if not chunk:
+        if length > CHUNK_SIZE:
+            length = CHUNK_SIZE
+        if not length:
             return
-        for item in chunk:
-            item = orjson.loads(item)
-            items.append(item)
-        if items:
-            await process.spawn_processing_threads(items)
+        await process.spawn_processing_threads(self.current_chunk, length)
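
The memory win is that get_chunk no longer builds an items list at all: it clamps the queue length to CHUNK_SIZE and passes only the chunk number and item count onward, leaving the pop-and-decode work to the processing side. The body of spawn_processing_threads is not part of this diff; as a minimal sketch of that contract — assuming db.ar is a redis-py asyncio client, KEYNAME is the same list key the ingest loop measures, and process_batch is a hypothetical worker — it could look like:

import asyncio

import orjson

import db

KEYNAME = "queue"  # assumed: the same Redis list Ingest.get_chunk measures


async def spawn_processing_threads(chunk, length):
    # Pop up to `length` raw entries in one round trip; rpop with a count
    # argument needs redis-py >= 4.1 and Redis >= 6.2.
    raw = await db.ar.rpop(KEYNAME, length)
    if not raw:
        return
    # Decode in the consumer rather than in the ingest loop, so the ingest
    # process never holds the parsed items.
    items = [orjson.loads(entry) for entry in raw]
    # Hand the CPU-bound batch work to a thread so the event loop stays free.
    await asyncio.get_running_loop().run_in_executor(
        None, process_batch, chunk, items
    )


def process_batch(chunk, items):
    # Hypothetical worker entry point, not shown in the diff.
    ...

Under this contract the chunk number exists only for logging, and length caps how many entries one iteration may pull off the queue.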