Improve memory usage and fix 4chan crawler
@@ -1,8 +1,6 @@
 import asyncio
 from os import getenv
 
-import orjson
-
 import db
 import util
 from processing import process
@@ -20,6 +18,7 @@ class Ingest(object):
     def __init__(self):
         name = self.__class__.__name__
         self.log = util.get_logger(name)
+        self.current_chunk = 0
         self.log.info(
             (
                 "Starting ingest handler for chunk size of "
@@ -30,20 +29,14 @@ class Ingest(object):
     async def run(self):
         while True:
             await self.get_chunk()
+            self.log.debug(f"Ingest chunk {self.current_chunk} complete")
+            self.current_chunk += 1
             await asyncio.sleep(ITER_DELAY)
 
     async def get_chunk(self):
-        items = []
-        # for source in SOURCES:
-        #     key = f"{KEYPREFIX}{source}"
         length = await db.ar.llen(KEYNAME)
-        start_num = length - CHUNK_SIZE
-        chunk = await db.ar.lrange(KEYNAME, start_num, -1)
-        # chunk = await db.ar.rpop(KEYNAME, CHUNK_SIZE)
-        if not chunk:
+        if length > CHUNK_SIZE:
+            length = CHUNK_SIZE
+        if not length:
             return
-        for item in chunk:
-            item = orjson.loads(item)
-            items.append(item)
-        if items:
-            await process.spawn_processing_threads(items)
+        await process.spawn_processing_threads(self.current_chunk, length)
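
Read together, the hunks move the heavy lifting out of the ingest loop: get_chunk no longer pulls the whole chunk out of Redis with lrange and decodes every entry with orjson.loads; it only clamps the queue length to CHUNK_SIZE and hands a chunk number and length to the processing side. Below is a minimal sketch of the post-commit flow, assuming db.ar is an async Redis client (e.g. redis.asyncio) and that KEYNAME, CHUNK_SIZE and ITER_DELAY come from the environment; the spawn_processing_threads body is hypothetical, inferred from the commented-out rpop call in the diff.

    import asyncio
    from os import getenv

    import orjson
    import redis.asyncio as redis

    # All three constants are assumptions; the diff only shows they exist.
    KEYNAME = "queue"
    CHUNK_SIZE = int(getenv("INGEST_CHUNK_SIZE", "900"))
    ITER_DELAY = float(getenv("INGEST_ITER_DELAY", "0.5"))

    ar = redis.Redis()  # stands in for db.ar


    async def spawn_processing_threads(chunk_number, length):
        # Hypothetical consumer side: the pop and decode happen here, so
        # the ingest loop never holds a decoded chunk in memory. The
        # commented-out rpop in the diff suggests the pop moved here.
        raw = await ar.rpop(KEYNAME, length)  # requires Redis >= 6.2
        items = [orjson.loads(entry) for entry in raw or []]
        print(f"chunk {chunk_number}: {len(items)} items")


    async def get_chunk(current_chunk):
        # Post-commit logic as the diff shows it: inspect the queue
        # length, clamp it to CHUNK_SIZE, and delegate; no lrange and
        # no local items buffer.
        length = await ar.llen(KEYNAME)
        if length > CHUNK_SIZE:
            length = CHUNK_SIZE
        if not length:
            return
        await spawn_processing_threads(current_chunk, length)


    async def main():
        current_chunk = 0
        while True:
            await get_chunk(current_chunk)
            current_chunk += 1
            await asyncio.sleep(ITER_DELAY)


    asyncio.run(main())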