From 06e80a9759c4be7203db75a09b6d54a59e493f4e Mon Sep 17 00:00:00 2001
From: Mark Veidemanis
Date: Sat, 1 Oct 2022 14:46:45 +0100
Subject: [PATCH] Time stuff and switch to gensim for tokenisation

---
 docker/Dockerfile       |   3 +-
 docker/requirements.txt |   3 +-
 processing/process.py   | 134 ++++++++++++++++++++++++++++------------
 requirements.txt        |   3 +-
 4 files changed, 101 insertions(+), 42 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 0da9448..cd5dd99 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -16,7 +16,8 @@ COPY requirements.txt /code/
 COPY discord-patched.tgz /code/
 
 RUN python -m venv /venv
-RUN . /venv/bin/activate && pip install -r requirements.txt && python -m spacy download en_core_web_sm
+RUN . /venv/bin/activate && pip install -r requirements.txt
+# && python -m spacy download en_core_web_sm
 
 RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
 
diff --git a/docker/requirements.txt b/docker/requirements.txt
index e8fb9c4..b3f7703 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -15,7 +15,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop
diff --git a/processing/process.py b/processing/process.py
index 6110b92..4082046 100644
--- a/processing/process.py
+++ b/processing/process.py
@@ -5,6 +5,9 @@ import random
 # For key generation
 import string
 
+# For timing
+import time
+
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -16,11 +19,21 @@ from math import ceil
 import orjson
 import regex
 
-# Tokenisation
-import spacy
-
 # For 4chan message parsing
 from bs4 import BeautifulSoup
+
+# Tokenisation
+# import spacy
+from gensim.parsing.preprocessing import (  # stem_text,
+    preprocess_string,
+    remove_stopwords,
+    strip_multiple_whitespaces,
+    strip_non_alphanum,
+    strip_numeric,
+    strip_punctuation,
+    strip_short,
+    strip_tags,
+)
 from numpy import array_split
 from polyglot.detect.base import logger as polyglot_logger
 
@@ -38,30 +51,17 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
 
-# For tokenisation
-# from gensim.parsing.preprocessing import (
-#     strip_tags,
-#     strip_punctuation,
-#     strip_numeric,
-#     stem_text,
-#     strip_multiple_whitespaces,
-#     strip_non_alphanum,
-#     remove_stopwords,
-#     strip_short,
-#     preprocess_string,
-# )
-
-# CUSTOM_FILTERS = [
-#     lambda x: x.lower(),
-#     strip_tags,  #
-#     strip_punctuation,  #
-#     strip_multiple_whitespaces,
-#     strip_numeric,
-#     remove_stopwords,
-#     strip_short,
-#     #stem_text,
-#     strip_non_alphanum,  #
-# ]
+CUSTOM_FILTERS = [
+    lambda x: x.lower(),
+    strip_tags,  #
+    strip_punctuation,  #
+    strip_multiple_whitespaces,
+    strip_numeric,
+    remove_stopwords,
+    strip_short,
+    # stem_text,
+    strip_non_alphanum,  #
+]
 
 RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
 
@@ -70,8 +70,8 @@ polyglot_logger.setLevel("ERROR")
 
 warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
 
-TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
-nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
+# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
 
 log = util.get_logger("process")
 
@@ -133,29 +133,48 @@ async def spawn_processing_threads(data):
 
 def process_data(data):
     to_store = []
+    sentiment_time = 0.0
+    regex_time = 0.0
+    polyglot_time = 0.0
+    date_time = 0.0
+    nlp_time = 0.0
+    normalise_time = 0.0
+    hash_time = 0.0
+    normal2_time = 0.0
+    soup_time = 0.0
+
+    total_time = 0.0
+
     # Initialise sentiment analyser
     analyzer = SentimentIntensityAnalyzer()
     for msg in data:
-
+        total_start = time.process_time()
         # normalise fields
+        start = time.process_time()
         for key, value in list(msg.items()):
             if value is None:
                 del msg[key]
+        time_took = (time.process_time() - start) * 1000
+        normalise_time += time_took
 
         # Remove invalid UTF-8 characters
         # IRC and Discord
+        start = time.process_time()
         if "msg" in msg:
             msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])
 
         # 4chan - since we change the attributes below
         if "com" in msg:
             msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
+        time_took = (time.process_time() - start) * 1000
+        regex_time += time_took
 
         if msg["src"] == "4ch":
             board = msg["net"]
             thread = msg["channel"]
 
             # Calculate hash for post
+            start = time.process_time()
             post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
             hash = siphash(hash_key, post_normalised)
             hash = str(hash)
@@ -169,11 +188,18 @@ def process_data(data):
             else:
                 msg["type"] = "update"
             db.r.set(redis_key, hash)
+            time_took = (time.process_time() - start) * 1000
+            hash_time += time_took
+
+        start = time.process_time()
         for key2, value in list(msg.items()):
             if key2 in ATTRMAP:
                 msg[ATTRMAP[key2]] = msg[key2]
                 del msg[key2]
+        time_took = (time.process_time() - start) * 1000
+        normal2_time += time_took
 
+        start = time.process_time()
         if "ts" in msg:
             old_time = msg["ts"]
             # '08/30/22(Tue)02:25:37'
@@ -187,15 +213,22 @@ def process_data(data):
             msg["ts"] = new_ts
         else:
             raise Exception("No TS in msg")
+        time_took = (time.process_time() - start) * 1000
+        date_time += time_took
+
+        start = time.process_time()
         if "msg" in msg:
             soup = BeautifulSoup(msg["msg"], "html.parser")
             msg_str = soup.get_text(separator="\n")
             msg["msg"] = msg_str
+        time_took = (time.process_time() - start) * 1000
+        soup_time += time_took
 
         # Annotate sentiment/NLP
         if "msg" in msg:
-            RE_BAD_CHARS.sub("", msg["msg"])
+            # RE_BAD_CHARS.sub("", msg["msg"])
             # Language
+            start = time.process_time()
             text = Text(msg["msg"])
             try:
                 lang_code = text.language.code
@@ -206,22 +239,45 @@ def process_data(data):
                 log.error(f"Error detecting language: {e}")
                 # So below block doesn't fail
                 lang_code = None
+            time_took = (time.process_time() - start) * 1000
+            polyglot_time += time_took
 
             # Blatant discrimination
             if lang_code == "en":
-                # Sentiment
+                start = time.process_time()
                 vs = analyzer.polarity_scores(str(msg["msg"]))
                 addendum = vs["compound"]
                 msg["sentiment"] = addendum
-
-                # Tokens
-                n = nlp(msg["msg"])
-                for tag in TAGS:
-                    tag_name = tag.lower()
-                    tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
-                    msg[f"words_{tag_name}"] = tags_flag
+                time_took = (time.process_time() - start) * 1000
+                sentiment_time += time_took
+
+                # Tokens
+                start = time.process_time()
+                tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
+                msg["tokens"] = tokens
+                # n = nlp(msg["msg"])
+                # for tag in TAGS:
+                #     tag_name = tag.lower()
+                #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
+                #     msg[f"words_{tag_name}"] = tags_flag
+                time_took = (time.process_time() - start) * 1000
+                nlp_time += time_took
 
         # Add the mutated message to the return buffer
         to_store.append(msg)
+        total_time += (time.process_time() - total_start) * 1000
+    log.debug("=====================================")
+    log.debug(f"Sentiment: {sentiment_time}")
+    log.debug(f"Regex: {regex_time}")
+    log.debug(f"Polyglot: {polyglot_time}")
+    log.debug(f"Date: {date_time}")
+    log.debug(f"NLP: {nlp_time}")
+    log.debug(f"Normalise: {normalise_time}")
+    log.debug(f"Hash: {hash_time}")
+    log.debug(f"Normal2: {normal2_time}")
+    log.debug(f"Soup: {soup_time}")
+    log.debug(f"Total: {total_time}")
+    log.debug("=====================================")
+
 
     return to_store
diff --git a/requirements.txt b/requirements.txt
index 81131a3..5bc11b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop
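
For reference, a minimal standalone sketch of the gensim filter chain this
patch adopts in place of the spaCy POS/lemma pass. It assumes only that
gensim is installed; the sample text and printed output are illustrative,
not taken from the repository:

    from gensim.parsing.preprocessing import (
        preprocess_string,
        remove_stopwords,
        strip_multiple_whitespaces,
        strip_non_alphanum,
        strip_numeric,
        strip_punctuation,
        strip_short,
        strip_tags,
    )

    # Mirrors CUSTOM_FILTERS in processing/process.py: lowercase, strip HTML
    # tags, punctuation, repeated whitespace and digits, drop stopwords and
    # short tokens, then remove any remaining non-alphanumeric characters.
    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        remove_stopwords,
        strip_short,
        strip_non_alphanum,
    ]

    text = "<p>The QUICK brown fox jumped over 2 lazy dogs!</p>"
    print(preprocess_string(text, CUSTOM_FILTERS))
    # ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dogs']

Unlike the removed spaCy pass, which emitted per-POS lemma lists
(words_noun, words_adj, ...), this stores one flat "tokens" list and does
no lemmatisation or POS tagging, trading linguistic detail for speed.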
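The timing instrumentation follows a single pattern throughout
process_data: snapshot time.process_time() before each stage, accumulate
the elapsed milliseconds into a per-stage counter, and dump every counter
at debug level once the batch is done. A condensed sketch of that pattern
(stage names and stand-in bodies invented for illustration):

    import time

    stage_ms = {"normalise": 0.0, "tokenise": 0.0}
    total_ms = 0.0

    for msg in ["some message", "another message"]:
        total_start = time.process_time()

        start = time.process_time()
        msg = msg.strip()  # stand-in for the field-normalisation stage
        stage_ms["normalise"] += (time.process_time() - start) * 1000

        start = time.process_time()
        tokens = msg.split()  # stand-in for the tokenisation stage
        stage_ms["tokenise"] += (time.process_time() - start) * 1000

        total_ms += (time.process_time() - total_start) * 1000

    for stage, ms in stage_ms.items():
        print(f"{stage}: {ms}")
    print(f"Total: {total_ms}")

Note that time.process_time() measures CPU time, not wall-clock time, so
stages that block on I/O (the Redis get/set in the hash step, for example)
will look cheaper than they really are; time.perf_counter() would capture
the waiting as well.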