monolith/processing/process.py

import asyncio
import os
import random

# For key generation
import string

# For timing
import time

# Squash errors
import warnings
from concurrent.futures import ProcessPoolExecutor

# For timestamp processing
from datetime import datetime
from os import getenv

import orjson
import regex

# For 4chan message parsing
from bs4 import BeautifulSoup

# Tokenisation
# import spacy
from gensim.parsing.preprocessing import (  # stem_text,
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_numeric,
    strip_punctuation,
    strip_short,
    strip_tags,
)
from polyglot.detect.base import logger as polyglot_logger

# For NLP
from polyglot.text import Text
from pycld2 import error as cld2_error
from siphashc import siphash

# For sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import db
import util

# 4chan schema
from schemas.ch4_s import ATTRMAP

trues = ("true", "1", "t", True)

KEYNAME = "queue"

MONOLITH_PROCESS_PERFSTATS = (
    getenv("MONOLITH_PROCESS_PERFSTATS", "false").lower() in trues
)

CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,  #
    strip_punctuation,  #
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
    # stem_text,
    strip_non_alphanum,  #
]

RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")

# Squash errors
polyglot_logger.setLevel("ERROR")
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")


# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


log = util.get_logger("process")

# Maximum number of CPU threads to use for post processing
CPU_THREADS = int(os.getenv("MONOLITH_PROCESS_THREADS", os.cpu_count()))

p = ProcessPoolExecutor(CPU_THREADS)


def get_hash_key():
    hash_key = db.r.get("hashing_key")
    if not hash_key:
        letters = string.ascii_lowercase
        hash_key = "".join(random.choice(letters) for i in range(16))
        log.debug(f"Created new hash key: {hash_key}")
        db.r.set("hashing_key", hash_key)
    else:
        hash_key = hash_key.decode("ascii")
        log.debug(f"Decoded hash key: {hash_key}")
    return hash_key


hash_key = get_hash_key()


@asyncio.coroutine
async def spawn_processing_threads(chunk, length):
    log.debug(f"Spawning processing threads for chunk {chunk} of length {length}")
    loop = asyncio.get_event_loop()
    tasks = []

    if length < CPU_THREADS * 100:
        cores = 1
        chunk_size = length
    else:
        cores = CPU_THREADS
        chunk_size = int(length / cores)

    for index in range(cores):
        log.debug(
            f"[{chunk}/{index}] Delegating {chunk_size} messages to thread {index}"
        )
        task = loop.run_in_executor(p, process_data, chunk, index, chunk_size)
        tasks.append(task)

    results = [await task for task in tasks]

    # Join the results back from the split list
    flat_list = [item for sublist in results for item in sublist]
    log.debug(
        (
            f"[{chunk}/{index}] Results from processing of {length} messages in "
            f"{cores} threads: {len(flat_list)}"
        )
    )
    await db.store_batch(flat_list)

    # log.debug(f"Finished processing {len_data} messages")


def process_data(chunk, index, chunk_size):
    log.debug(f"[{chunk}/{index}] Processing {chunk_size} messages")
    to_store = []

    sentiment_time = 0.0
    regex_time = 0.0
    polyglot_time = 0.0
    date_time = 0.0
    nlp_time = 0.0
    normalise_time = 0.0
    hash_time = 0.0
    normal2_time = 0.0
    soup_time = 0.0

    total_time = 0.0

    # Initialise sentiment analyser
    analyzer = SentimentIntensityAnalyzer()
    for msg_index in range(chunk_size):
        msg = db.r.rpop(KEYNAME)
        if not msg:
            return
        # TODO: msgpack
        msg = orjson.loads(msg)
        total_start = time.process_time()
        # normalise fields
        start = time.process_time()
        for key, value in list(msg.items()):
            if value is None:
                del msg[key]
        time_took = (time.process_time() - start) * 1000
        normalise_time += time_took

        # Remove invalid UTF-8 characters
        # IRC and Discord
        start = time.process_time()
        if "msg" in msg:
            msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])

        # 4chan - since we change the attributes below
        if "com" in msg:
            msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
        time_took = (time.process_time() - start) * 1000
        regex_time += time_took

        if msg["src"] == "4ch":
            board = msg["net"]
            thread = msg["channel"]

            # Calculate hash for post
            start = time.process_time()
            post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
            hash = siphash(hash_key, post_normalised)
            hash = str(hash)
            redis_key = (
                f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
            )
            key_content = db.r.get(redis_key)
            if key_content is not None:
                key_content = key_content.decode("ascii")
                if key_content == hash:
                    # This deletes the message since the append at the end won't be hit
                    continue
                    # pass
                else:
                    msg["type"] = "update"
            db.r.set(redis_key, hash)
            time_took = (time.process_time() - start) * 1000
            hash_time += time_took

            start = time.process_time()
            for key2, value in list(msg.items()):
                if key2 in ATTRMAP:
                    msg[ATTRMAP[key2]] = msg[key2]
                    del msg[key2]
            time_took = (time.process_time() - start) * 1000
            normal2_time += time_took

            start = time.process_time()
            if "ts" in msg:
                old_time = msg["ts"]
                # '08/30/22(Tue)02:25:37'
                time_spl = old_time.split(":")
                if len(time_spl) == 3:
                    old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
                else:
                    old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
                # new_ts = old_ts.isoformat()
                new_ts = int(old_ts.timestamp())
                msg["ts"] = new_ts
            else:
                raise Exception("No TS in msg")
            time_took = (time.process_time() - start) * 1000
            date_time += time_took

            start = time.process_time()
            if "msg" in msg:
                soup = BeautifulSoup(msg["msg"], "html.parser")
                msg_str = soup.get_text(separator="\n")
                msg["msg"] = msg_str
            time_took = (time.process_time() - start) * 1000
            soup_time += time_took

        # Annotate sentiment/NLP
        if "msg" in msg:
            # RE_BAD_CHARS.sub("", msg["msg"])
            # Language
            start = time.process_time()
            text = Text(msg["msg"])
            try:
                lang_code = text.language.code
                lang_name = text.language.name
                msg["lang_code"] = lang_code
                msg["lang_name"] = lang_name
            except cld2_error as e:
                log.error(f"[{chunk}/{index}] Error detecting language: {e}")
                # So below block doesn't fail
                lang_code = None
            time_took = (time.process_time() - start) * 1000
            polyglot_time += time_took

            # Blatant discrimination
            if lang_code == "en":
                # Sentiment
                start = time.process_time()
                vs = analyzer.polarity_scores(str(msg["msg"]))
                addendum = vs["compound"]
                msg["sentiment"] = addendum
                time_took = (time.process_time() - start) * 1000
                sentiment_time += time_took

            # Tokens
            start = time.process_time()
            tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
            msg["tokens"] = tokens
            # n = nlp(msg["msg"])
            # for tag in TAGS:
            #     tag_name = tag.lower()
            #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
            #     msg[f"words_{tag_name}"] = tags_flag
            time_took = (time.process_time() - start) * 1000
            nlp_time += time_took

        # Add the mutated message to the return buffer
        to_store.append(msg)
        total_time += (time.process_time() - total_start) * 1000

    if MONOLITH_PROCESS_PERFSTATS:
        log.debug("=====================================")
        log.debug(f"Chunk: {chunk}")
        log.debug(f"Index: {index}")
        log.debug(f"Sentiment: {sentiment_time}")
        log.debug(f"Regex: {regex_time}")
        log.debug(f"Polyglot: {polyglot_time}")
        log.debug(f"Date: {date_time}")
        log.debug(f"NLP: {nlp_time}")
        log.debug(f"Normalise: {normalise_time}")
        log.debug(f"Hash: {hash_time}")
        log.debug(f"Normal2: {normal2_time}")
        log.debug(f"Soup: {soup_time}")
        log.debug(f"Total: {total_time}")
        log.debug("=====================================")

    return to_store
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`import asyncio`
			`import os`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`import random`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`# For key generation`
			`import string`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`# For timing`
			`import time`

Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# Squash errors`
			`import warnings`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`from concurrent.futures import ProcessPoolExecutor`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`# For timestamp processing`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`from datetime import datetime`
Add some environment variables to control debug output 2022-10-21 06:20:30 +00:00			`from os import getenv`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`import orjson`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00			`import regex`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`# For 4chan message parsing`
			`from bs4 import BeautifulSoup`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00
			`# Tokenisation`
			`# import spacy`
			`from gensim.parsing.preprocessing import ( # stem_text,`
			`preprocess_string,`
			`remove_stopwords,`
			`strip_multiple_whitespaces,`
			`strip_non_alphanum,`
			`strip_numeric,`
			`strip_punctuation,`
			`strip_short,`
			`strip_tags,`
			`)`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`from polyglot.detect.base import logger as polyglot_logger`

			`# For NLP`
			`from polyglot.text import Text`
			`from pycld2 import error as cld2_error`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`from siphashc import siphash`

Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# For sentiment`
			`from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`import db`
			`import util`

			`# 4chan schema`
			`from schemas.ch4_s import ATTRMAP`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
Add some environment variables to control debug output 2022-10-21 06:20:30 +00:00			`trues = ("true", "1", "t", True)`

Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`KEYNAME = "queue"`

Add some environment variables to control debug output 2022-10-21 06:20:30 +00:00			`MONOLITH_PROCESS_PERFSTATS = (`
			`getenv("MONOLITH_PROCESS_PERFSTATS", "false").lower() in trues`
			`)`

Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`CUSTOM_FILTERS = [`
			`lambda x: x.lower(),`
			`strip_tags, #`
			`strip_punctuation, #`
			`strip_multiple_whitespaces,`
			`strip_numeric,`
			`remove_stopwords,`
			`strip_short,`
			`# stem_text,`
			`strip_non_alphanum, #`
			`]`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00			`RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")`

Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# Squash errors`
			`polyglot_logger.setLevel("ERROR")`
			`warnings.filterwarnings("ignore", category=UserWarning, module="bs4")`


Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]`
			`# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00

Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`log = util.get_logger("process")`

			`# Maximum number of CPU threads to use for post processing`
Make CPU threads configurable 2022-09-20 21:29:13 +00:00			`CPU_THREADS = int(os.getenv("MONOLITH_PROCESS_THREADS", os.cpu_count()))`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00
			`p = ProcessPoolExecutor(CPU_THREADS)`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`def get_hash_key():`
			`hash_key = db.r.get("hashing_key")`
			`if not hash_key:`
			`letters = string.ascii_lowercase`
			`hash_key = "".join(random.choice(letters) for i in range(16))`
			`log.debug(f"Created new hash key: {hash_key}")`
			`db.r.set("hashing_key", hash_key)`
			`else:`
			`hash_key = hash_key.decode("ascii")`
			`log.debug(f"Decoded hash key: {hash_key}")`
			`return hash_key`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`hash_key = get_hash_key()`

Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
			`@asyncio.coroutine`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`async def spawn_processing_threads(chunk, length):`
			`log.debug(f"Spawning processing threads for chunk {chunk} of length {length}")`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`loop = asyncio.get_event_loop()`
			`tasks = []`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`if length < CPU_THREADS * 100:`
			`cores = 1`
			`chunk_size = length`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`else:`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`cores = CPU_THREADS`
			`chunk_size = int(length / cores)`

			`for index in range(cores):`
			`log.debug(`
			`f"[{chunk}/{index}] Delegating {chunk_size} messages to thread {index}"`
			`)`
			`task = loop.run_in_executor(p, process_data, chunk, index, chunk_size)`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`tasks.append(task)`

			`results = [await task for task in tasks]`
Print the length of the flattened list in debug message 2022-10-21 06:20:30 +00:00
			`# Join the results back from the split list`
			`flat_list = [item for sublist in results for item in sublist]`
Make debug output cleaner 2022-09-22 16:39:29 +00:00			`log.debug(`
			`(`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`f"[{chunk}/{index}] Results from processing of {length} messages in "`
			`f"{cores} threads: {len(flat_list)}"`
Make debug output cleaner 2022-09-22 16:39:29 +00:00			`)`
			`)`
Fully implement Elasticsearch indexing 2022-11-22 20:15:02 +00:00			`await db.store_batch(flat_list)`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00
Make debug output cleaner 2022-09-22 16:39:29 +00:00			`# log.debug(f"Finished processing {len_data} messages")`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00

Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`def process_data(chunk, index, chunk_size):`
			`log.debug(f"[{chunk}/{index}] Processing {chunk_size} messages")`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`to_store = []`

Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`sentiment_time = 0.0`
			`regex_time = 0.0`
			`polyglot_time = 0.0`
			`date_time = 0.0`
			`nlp_time = 0.0`
			`normalise_time = 0.0`
			`hash_time = 0.0`
			`normal2_time = 0.0`
			`soup_time = 0.0`

			`total_time = 0.0`

Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# Initialise sentiment analyser`
			`analyzer = SentimentIntensityAnalyzer()`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`for msg_index in range(chunk_size):`
			`msg = db.r.rpop(KEYNAME)`
			`if not msg:`
			`return`
Send messages to Neptune Redis via PubSub 2023-01-12 07:20:43 +00:00			`# TODO: msgpack`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`msg = orjson.loads(msg)`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`total_start = time.process_time()`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00			`# normalise fields`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`start = time.process_time()`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00			`for key, value in list(msg.items()):`
			`if value is None:`
			`del msg[key]`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`normalise_time += time_took`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00
			`# Remove invalid UTF-8 characters`
			`# IRC and Discord`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`start = time.process_time()`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00			`if "msg" in msg:`
			`msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])`

			`# 4chan - since we change the attributes below`
			`if "com" in msg:`
Remove debugging code and fix regex substitution 2022-09-21 11:48:54 +00:00			`msg["com"] = RE_BAD_CHARS.sub("", msg["com"])`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`regex_time += time_took`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if msg["src"] == "4ch":`
			`board = msg["net"]`
			`thread = msg["channel"]`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`# Calculate hash for post`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`start = time.process_time()`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`hash = siphash(hash_key, post_normalised)`
			`hash = str(hash)`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`redis_key = (`
			`f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"`
			`)`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`key_content = db.r.get(redis_key)`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`if key_content is not None:`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`key_content = key_content.decode("ascii")`
			`if key_content == hash:`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# This deletes the message since the append at the end won't be hit`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`continue`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`# pass`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`else:`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`msg["type"] = "update"`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`db.r.set(redis_key, hash)`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`hash_time += time_took`

			`start = time.process_time()`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`for key2, value in list(msg.items()):`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if key2 in ATTRMAP:`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`msg[ATTRMAP[key2]] = msg[key2]`
			`del msg[key2]`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`normal2_time += time_took`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`start = time.process_time()`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`if "ts" in msg:`
			`old_time = msg["ts"]`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`# '08/30/22(Tue)02:25:37'`
			`time_spl = old_time.split(":")`
			`if len(time_spl) == 3:`
			`old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")`
			`else:`
			`old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")`
			`# new_ts = old_ts.isoformat()`
			`new_ts = int(old_ts.timestamp())`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`msg["ts"] = new_ts`
Properly process Redis buffered messages and ingest into Kafka 2022-09-14 17:32:32 +00:00			`else:`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`raise Exception("No TS in msg")`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`date_time += time_took`

			`start = time.process_time()`
Ingest into Kafka and queue messages better 2022-09-13 21:17:46 +00:00			`if "msg" in msg:`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`soup = BeautifulSoup(msg["msg"], "html.parser")`
			`msg_str = soup.get_text(separator="\n")`
			`msg["msg"] = msg_str`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`soup_time += time_took`
Normalise fields in processing and remove invalid characters 2022-09-21 09:01:12 +00:00
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# Annotate sentiment/NLP`
			`if "msg" in msg:`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`# RE_BAD_CHARS.sub("", msg["msg"])`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# Language`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`start = time.process_time()`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`text = Text(msg["msg"])`
			`try:`
			`lang_code = text.language.code`
			`lang_name = text.language.name`
			`msg["lang_code"] = lang_code`
			`msg["lang_name"] = lang_name`
			`except cld2_error as e:`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`log.error(f"[{chunk}/{index}] Error detecting language: {e}")`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`# So below block doesn't fail`
			`lang_code = None`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`polyglot_time += time_took`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
			`# Blatant discrimination`
			`if lang_code == "en":`
			`# Sentiment`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`start = time.process_time()`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`vs = analyzer.polarity_scores(str(msg["msg"]))`
			`addendum = vs["compound"]`
			`msg["sentiment"] = addendum`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`time_took = (time.process_time() - start) * 1000`
			`sentiment_time += time_took`

			`# Tokens`
			`start = time.process_time()`
			`tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)`
			`msg["tokens"] = tokens`
			`# n = nlp(msg["msg"])`
			`# for tag in TAGS:`
			`# tag_name = tag.lower()`
			`# tags_flag = [token.lemma_ for token in n if token.pos_ == tag]`
			`# msg[f"words_{tag_name}"] = tags_flag`
			`time_took = (time.process_time() - start) * 1000`
			`nlp_time += time_took`
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00
			`# Add the mutated message to the return buffer`
			`to_store.append(msg)`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00			`total_time += (time.process_time() - total_start) * 1000`
Add some environment variables to control debug output 2022-10-21 06:20:30 +00:00
			`if MONOLITH_PROCESS_PERFSTATS:`
			`log.debug("=====================================")`
Improve memory usage and fix 4chan crawler 2022-10-21 06:20:30 +00:00			`log.debug(f"Chunk: {chunk}")`
			`log.debug(f"Index: {index}")`
Add some environment variables to control debug output 2022-10-21 06:20:30 +00:00			`log.debug(f"Sentiment: {sentiment_time}")`
			`log.debug(f"Regex: {regex_time}")`
			`log.debug(f"Polyglot: {polyglot_time}")`
			`log.debug(f"Date: {date_time}")`
			`log.debug(f"NLP: {nlp_time}")`
			`log.debug(f"Normalise: {normalise_time}")`
			`log.debug(f"Hash: {hash_time}")`
			`log.debug(f"Normal2: {normal2_time}")`
			`log.debug(f"Soup: {soup_time}")`
			`log.debug(f"Total: {total_time}")`
			`log.debug("=====================================")`
Time stuff and switch to gensim for tokenisation 2022-10-01 13:46:45 +00:00
Implement sentiment/NLP annotation and optimise processing 2022-09-16 16:09:49 +00:00			`return to_store`