Add per-stage timing to process_data and switch from spaCy to gensim for tokenisation

2022-10-01 14:46:45 +01:00 · 2022-10-01 14:46:45 +01:00 · 06e80a9759
parent 5c91f1af87
commit 06e80a9759
4 changed files with 100 additions and 41 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -16,7 +16,8 @@ COPY requirements.txt /code/
 COPY discord-patched.tgz /code/
 RUN python -m venv /venv
-RUN . /venv/bin/activate && pip install -r requirements.txt && python -m spacy download en_core_web_sm
+RUN . /venv/bin/activate && pip install -r requirements.txt
 # && python -m spacy download en_core_web_sm
 RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@ -15,7 +15,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
 gensim
 python-Levenshtein
 orjson
 uvloop
--- a/processing/process.py
+++ b/processing/process.py
@ -5,6 +5,9 @@ import random
 # For key generation
 import string
 # For timing
 import time
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@ -16,11 +19,21 @@ from math import ceil
 import orjson
 import regex
 # Tokenisation
 import spacy
 # For 4chan message parsing
 from bs4 import BeautifulSoup
 # Tokenisation
 # import spacy
 from gensim.parsing.preprocessing import (  # stem_text,
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_numeric,
    strip_punctuation,
    strip_short,
    strip_tags,
 )
 from numpy import array_split
 from polyglot.detect.base import logger as polyglot_logger
@ -38,30 +51,17 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
-# For tokenisation
+CUSTOM_FILTERS = [
-# from gensim.parsing.preprocessing import (
+    lambda x: x.lower(),
-#     strip_tags,
+    strip_tags,  #
-#     strip_punctuation,
+    strip_punctuation,  #
-#     strip_numeric,
+    strip_multiple_whitespaces,
-#     stem_text,
+    strip_numeric,
-#     strip_multiple_whitespaces,
+    remove_stopwords,
-#     strip_non_alphanum,
+    strip_short,
-#     remove_stopwords,
+    # stem_text,
-#     strip_short,
+    strip_non_alphanum,  #
-#     preprocess_string,
+]
 # )
 # CUSTOM_FILTERS = [
 #     lambda x: x.lower(),
 #     strip_tags,  #
 #     strip_punctuation,  #
 #     strip_multiple_whitespaces,
 #     strip_numeric,
 #     remove_stopwords,
 #     strip_short,
 #     #stem_text,
 #     strip_non_alphanum,  #
 # ]
 RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
@ -70,8 +70,8 @@ polyglot_logger.setLevel("ERROR")
 warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
-TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
+# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
-nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
 log = util.get_logger("process")
@ -133,29 +133,48 @@ async def spawn_processing_threads(data):
 def process_data(data):
    to_store = []
    sentiment_time = 0.0
    regex_time = 0.0
    polyglot_time = 0.0
    date_time = 0.0
    nlp_time = 0.0
    normalise_time = 0.0
    hash_time = 0.0
    normal2_time = 0.0
    soup_time = 0.0
    total_time = 0.0
    # Initialise sentiment analyser
    analyzer = SentimentIntensityAnalyzer()
    for msg in data:
-
+        total_start = time.process_time()
        # normalise fields
        start = time.process_time()
        for key, value in list(msg.items()):
            if value is None:
                del msg[key]
        time_took = (time.process_time() - start) * 1000
        normalise_time += time_took
        # Remove invalid UTF-8 characters
        # IRC and Discord
        start = time.process_time()
        if "msg" in msg:
            msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])
        # 4chan - since we change the attributes below
        if "com" in msg:
            msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
        time_took = (time.process_time() - start) * 1000
        regex_time += time_took
        if msg["src"] == "4ch":
            board = msg["net"]
            thread = msg["channel"]
            # Calculate hash for post
            start = time.process_time()
            post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
            hash = siphash(hash_key, post_normalised)
            hash = str(hash)
@ -169,11 +188,18 @@ def process_data(data):
                else:
                    msg["type"] = "update"
            db.r.set(redis_key, hash)
            time_took = (time.process_time() - start) * 1000
            hash_time += time_took
            start = time.process_time()
            for key2, value in list(msg.items()):
                if key2 in ATTRMAP:
                    msg[ATTRMAP[key2]] = msg[key2]
                    del msg[key2]
            time_took = (time.process_time() - start) * 1000
            normal2_time += time_took
            start = time.process_time()
            if "ts" in msg:
                old_time = msg["ts"]
                # '08/30/22(Tue)02:25:37'
@ -187,15 +213,22 @@ def process_data(data):
                msg["ts"] = new_ts
            else:
                raise Exception("No TS in msg")
            time_took = (time.process_time() - start) * 1000
            date_time += time_took
            start = time.process_time()
            if "msg" in msg:
                soup = BeautifulSoup(msg["msg"], "html.parser")
                msg_str = soup.get_text(separator="\n")
                msg["msg"] = msg_str
            time_took = (time.process_time() - start) * 1000
            soup_time += time_took
        # Annotate sentiment/NLP
        if "msg" in msg:
-            RE_BAD_CHARS.sub("", msg["msg"])
+            # RE_BAD_CHARS.sub("", msg["msg"])
            # Language
            start = time.process_time()
            text = Text(msg["msg"])
            try:
                lang_code = text.language.code
@ -206,22 +239,45 @@ def process_data(data):
                log.error(f"Error detecting language: {e}")
                # So below block doesn't fail
                lang_code = None
            time_took = (time.process_time() - start) * 1000
            polyglot_time += time_took
            # Blatant discrimination
            if lang_code == "en":
                # Sentiment
                start = time.process_time()
                vs = analyzer.polarity_scores(str(msg["msg"]))
                addendum = vs["compound"]
                msg["sentiment"] = addendum
                time_took = (time.process_time() - start) * 1000
                sentiment_time += time_took
-                # Tokens
+            # Tokens
-                n = nlp(msg["msg"])
+            start = time.process_time()
-                for tag in TAGS:
+            tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
-                    tag_name = tag.lower()
+            msg["tokens"] = tokens
-                    tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
+            # n = nlp(msg["msg"])
-                    msg[f"words_{tag_name}"] = tags_flag
+            # for tag in TAGS:
            #     tag_name = tag.lower()
            #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
            #     msg[f"words_{tag_name}"] = tags_flag
            time_took = (time.process_time() - start) * 1000
            nlp_time += time_took
        # Add the mutated message to the return buffer
        to_store.append(msg)
        total_time += (time.process_time() - total_start) * 1000
    log.debug("=====================================")
    log.debug(f"Sentiment: {sentiment_time}")
    log.debug(f"Regex: {regex_time}")
    log.debug(f"Polyglot: {polyglot_time}")
    log.debug(f"Date: {date_time}")
    log.debug(f"NLP: {nlp_time}")
    log.debug(f"Normalise: {normalise_time}")
    log.debug(f"Hash: {hash_time}")
    log.debug(f"Normal2: {normal2_time}")
    log.debug(f"Soup: {soup_time}")
    log.debug(f"Total: {total_time}")
    log.debug("=====================================")
    return to_store
--- a/requirements.txt
+++ b/requirements.txt
@ -16,7 +16,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
 gensim
 python-Levenshtein
 orjson
 uvloop