Time stuff and switch to gensim for tokenisation

2022-10-01 14:46:45 +01:00 · 2022-10-01 14:46:45 +01:00 · 817bfd8835
parent 40cf0c6430
commit 817bfd8835
4 changed files with 100 additions and 41 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -16,7 +16,8 @@ COPY requirements.txt /code/
 COPY discord-patched.tgz /code/

 RUN python -m venv /venv
-RUN . /venv/bin/activate && pip install -r requirements.txt && python -m spacy download en_core_web_sm
+RUN . /venv/bin/activate && pip install -r requirements.txt
+# && python -m spacy download en_core_web_sm

 RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages

--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -15,7 +15,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop
--- a/processing/process.py
+++ b/processing/process.py
@@ -5,6 +5,9 @@ import random
 # For key generation
 import string

+# For timing
+import time
+
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -16,11 +19,21 @@ from math import ceil
 import orjson
 import regex

-# Tokenisation
-import spacy
-
 # For 4chan message parsing
 from bs4 import BeautifulSoup
+
+# Tokenisation
+# import spacy
+from gensim.parsing.preprocessing import (  # stem_text,
+    preprocess_string,
+    remove_stopwords,
+    strip_multiple_whitespaces,
+    strip_non_alphanum,
+    strip_numeric,
+    strip_punctuation,
+    strip_short,
+    strip_tags,
+)
 from numpy import array_split
 from polyglot.detect.base import logger as polyglot_logger

@@ -38,30 +51,17 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP

-# For tokenisation
-# from gensim.parsing.preprocessing import (
-#     strip_tags,
-#     strip_punctuation,
-#     strip_numeric,
+CUSTOM_FILTERS = [
+    lambda x: x.lower(),
+    strip_tags,  #
+    strip_punctuation,  #
+    strip_multiple_whitespaces,
+    strip_numeric,
+    remove_stopwords,
+    strip_short,
    # stem_text,
-#     strip_multiple_whitespaces,
-#     strip_non_alphanum,
-#     remove_stopwords,
-#     strip_short,
-#     preprocess_string,
-# )
-
-# CUSTOM_FILTERS = [
-#     lambda x: x.lower(),
-#     strip_tags,  #
-#     strip_punctuation,  #
-#     strip_multiple_whitespaces,
-#     strip_numeric,
-#     remove_stopwords,
-#     strip_short,
-#     #stem_text,
-#     strip_non_alphanum,  #
-# ]
+    strip_non_alphanum,  #
+]

 RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")

@@ -70,8 +70,8 @@ polyglot_logger.setLevel("ERROR")
 warnings.filterwarnings("ignore", category=UserWarning, module="bs4")


-TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
-nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
+# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


 log = util.get_logger("process")
@@ -133,29 +133,48 @@ async def spawn_processing_threads(data):
 def process_data(data):
    to_store = []

+    sentiment_time = 0.0
+    regex_time = 0.0
+    polyglot_time = 0.0
+    date_time = 0.0
+    nlp_time = 0.0
+    normalise_time = 0.0
+    hash_time = 0.0
+    normal2_time = 0.0
+    soup_time = 0.0
+
+    total_time = 0.0
+
    # Initialise sentiment analyser
    analyzer = SentimentIntensityAnalyzer()
    for msg in data:
-
+        total_start = time.process_time()
        # normalise fields
+        start = time.process_time()
        for key, value in list(msg.items()):
            if value is None:
                del msg[key]
+        time_took = (time.process_time() - start) * 1000
+        normalise_time += time_took

        # Remove invalid UTF-8 characters
        # IRC and Discord
+        start = time.process_time()
        if "msg" in msg:
            msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])

        # 4chan - since we change the attributes below
        if "com" in msg:
            msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
+        time_took = (time.process_time() - start) * 1000
+        regex_time += time_took

        if msg["src"] == "4ch":
            board = msg["net"]
            thread = msg["channel"]

            # Calculate hash for post
+            start = time.process_time()
            post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
            hash = siphash(hash_key, post_normalised)
            hash = str(hash)
@@ -169,11 +188,18 @@ def process_data(data):
                else:
                    msg["type"] = "update"
            db.r.set(redis_key, hash)
+            time_took = (time.process_time() - start) * 1000
+            hash_time += time_took
+
+            start = time.process_time()
            for key2, value in list(msg.items()):
                if key2 in ATTRMAP:
                    msg[ATTRMAP[key2]] = msg[key2]
                    del msg[key2]
+            time_took = (time.process_time() - start) * 1000
+            normal2_time += time_took

+            start = time.process_time()
            if "ts" in msg:
                old_time = msg["ts"]
                # '08/30/22(Tue)02:25:37'
@@ -187,15 +213,22 @@ def process_data(data):
                msg["ts"] = new_ts
            else:
                raise Exception("No TS in msg")
+            time_took = (time.process_time() - start) * 1000
+            date_time += time_took
+
+            start = time.process_time()
            if "msg" in msg:
                soup = BeautifulSoup(msg["msg"], "html.parser")
                msg_str = soup.get_text(separator="\n")
                msg["msg"] = msg_str
+            time_took = (time.process_time() - start) * 1000
+            soup_time += time_took

        # Annotate sentiment/NLP
        if "msg" in msg:
-            RE_BAD_CHARS.sub("", msg["msg"])
+            # RE_BAD_CHARS.sub("", msg["msg"])
            # Language
+            start = time.process_time()
            text = Text(msg["msg"])
            try:
                lang_code = text.language.code
@@ -206,22 +239,45 @@ def process_data(data):
                log.error(f"Error detecting language: {e}")
                # So below block doesn't fail
                lang_code = None
+            time_took = (time.process_time() - start) * 1000
+            polyglot_time += time_took

            # Blatant discrimination
            if lang_code == "en":
-
                # Sentiment
+                start = time.process_time()
                vs = analyzer.polarity_scores(str(msg["msg"]))
                addendum = vs["compound"]
                msg["sentiment"] = addendum
+                time_took = (time.process_time() - start) * 1000
+                sentiment_time += time_took

            # Tokens
-                n = nlp(msg["msg"])
-                for tag in TAGS:
-                    tag_name = tag.lower()
-                    tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
-                    msg[f"words_{tag_name}"] = tags_flag
+            start = time.process_time()
+            tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
+            msg["tokens"] = tokens
+            # n = nlp(msg["msg"])
+            # for tag in TAGS:
+            #     tag_name = tag.lower()
+            #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
+            #     msg[f"words_{tag_name}"] = tags_flag
+            time_took = (time.process_time() - start) * 1000
+            nlp_time += time_took

        # Add the mutated message to the return buffer
        to_store.append(msg)
+        total_time += (time.process_time() - total_start) * 1000
+    log.debug("=====================================")
+    log.debug(f"Sentiment: {sentiment_time}")
+    log.debug(f"Regex: {regex_time}")
+    log.debug(f"Polyglot: {polyglot_time}")
+    log.debug(f"Date: {date_time}")
+    log.debug(f"NLP: {nlp_time}")
+    log.debug(f"Normalise: {normalise_time}")
+    log.debug(f"Hash: {hash_time}")
+    log.debug(f"Normal2: {normal2_time}")
+    log.debug(f"Soup: {soup_time}")
+    log.debug(f"Total: {total_time}")
+    log.debug("=====================================")
+
    return to_store
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop