From 06e80a9759c4be7203db75a09b6d54a59e493f4e Mon Sep 17 00:00:00 2001
From: Mark Veidemanis
Date: Sat, 1 Oct 2022 14:46:45 +0100
Subject: [PATCH] Time stuff and switch to gensim for tokenisation

---
 docker/Dockerfile       |   3 +-
 docker/requirements.txt |   3 +-
 processing/process.py   | 134 ++++++++++++++++++++++++++++------------
 requirements.txt        |   3 +-
 4 files changed, 101 insertions(+), 42 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 0da9448..cd5dd99 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -16,7 +16,8 @@ COPY requirements.txt /code/
 COPY discord-patched.tgz /code/
 
 RUN python -m venv /venv
-RUN . /venv/bin/activate && pip install -r requirements.txt && python -m spacy download en_core_web_sm
+RUN . /venv/bin/activate && pip install -r requirements.txt
+# && python -m spacy download en_core_web_sm
 
 RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages
 
diff --git a/docker/requirements.txt b/docker/requirements.txt
index e8fb9c4..b3f7703 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -15,7 +15,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop
diff --git a/processing/process.py b/processing/process.py
index 6110b92..4082046 100644
--- a/processing/process.py
+++ b/processing/process.py
@@ -5,6 +5,9 @@ import random
 # For key generation
 import string
 
+# For timing
+import time
+
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -16,11 +19,21 @@ from math import ceil
 import orjson
 import regex
 
-# Tokenisation
-import spacy
-
 # For 4chan message parsing
 from bs4 import BeautifulSoup
+
+# Tokenisation
+# import spacy
+from gensim.parsing.preprocessing import (  # stem_text,
+    preprocess_string,
+    remove_stopwords,
+    strip_multiple_whitespaces,
+    strip_non_alphanum,
+    strip_numeric,
+    strip_punctuation,
+    strip_short,
+    strip_tags,
+)
 from numpy import array_split
 from polyglot.detect.base import logger as polyglot_logger
 
@@ -38,30 +51,17 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
 
-# For tokenisation
-# from gensim.parsing.preprocessing import (
-#     strip_tags,
-#     strip_punctuation,
-#     strip_numeric,
-#     stem_text,
-#     strip_multiple_whitespaces,
-#     strip_non_alphanum,
-#     remove_stopwords,
-#     strip_short,
-#     preprocess_string,
-# )
-
-# CUSTOM_FILTERS = [
-#     lambda x: x.lower(),
-#     strip_tags,  #
-#     strip_punctuation,  #
-#     strip_multiple_whitespaces,
-#     strip_numeric,
-#     remove_stopwords,
-#     strip_short,
-#     #stem_text,
-#     strip_non_alphanum,  #
-# ]
+CUSTOM_FILTERS = [
+    lambda x: x.lower(),
+    strip_tags,  #
+    strip_punctuation,  #
+    strip_multiple_whitespaces,
+    strip_numeric,
+    remove_stopwords,
+    strip_short,
+    # stem_text,
+    strip_non_alphanum,  #
+]
 
 RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
 
@@ -70,8 +70,8 @@ polyglot_logger.setLevel("ERROR")
 
 warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
 
-TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
-nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
+# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
 
 log = util.get_logger("process")
 
@@ -133,29 +133,48 @@ async def spawn_processing_threads(data):
 
 def process_data(data):
     to_store = []
+    sentiment_time = 0.0
+    regex_time = 0.0
+    polyglot_time = 0.0
+    date_time = 0.0
+    nlp_time = 0.0
+    normalise_time = 0.0
+    hash_time = 0.0
+    normal2_time = 0.0
+    soup_time = 0.0
+
+    total_time = 0.0
+
     # Initialise sentiment analyser
     analyzer = SentimentIntensityAnalyzer()
     for msg in data:
-
+        total_start = time.process_time()
         # normalise fields
+        start = time.process_time()
         for key, value in list(msg.items()):
             if value is None:
                 del msg[key]
+        time_took = (time.process_time() - start) * 1000
+        normalise_time += time_took
 
         # Remove invalid UTF-8 characters
         # IRC and Discord
+        start = time.process_time()
         if "msg" in msg:
             msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])
 
         # 4chan - since we change the attributes below
         if "com" in msg:
             msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
+        time_took = (time.process_time() - start) * 1000
+        regex_time += time_took
 
         if msg["src"] == "4ch":
             board = msg["net"]
             thread = msg["channel"]
 
             # Calculate hash for post
+            start = time.process_time()
             post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
             hash = siphash(hash_key, post_normalised)
             hash = str(hash)
@@ -169,11 +188,18 @@ def process_data(data):
             else:
                 msg["type"] = "update"
             db.r.set(redis_key, hash)
+            time_took = (time.process_time() - start) * 1000
+            hash_time += time_took
+
+        start = time.process_time()
         for key2, value in list(msg.items()):
             if key2 in ATTRMAP:
                 msg[ATTRMAP[key2]] = msg[key2]
                 del msg[key2]
+        time_took = (time.process_time() - start) * 1000
+        normal2_time += time_took
 
+        start = time.process_time()
         if "ts" in msg:
             old_time = msg["ts"]
             # '08/30/22(Tue)02:25:37'
@@ -187,15 +213,22 @@ def process_data(data):
             msg["ts"] = new_ts
         else:
             raise Exception("No TS in msg")
+        time_took = (time.process_time() - start) * 1000
+        date_time += time_took
+
+        start = time.process_time()
         if "msg" in msg:
             soup = BeautifulSoup(msg["msg"], "html.parser")
             msg_str = soup.get_text(separator="\n")
             msg["msg"] = msg_str
+        time_took = (time.process_time() - start) * 1000
+        soup_time += time_took
 
         # Annotate sentiment/NLP
         if "msg" in msg:
-            RE_BAD_CHARS.sub("", msg["msg"])
+            # RE_BAD_CHARS.sub("", msg["msg"])
             # Language
+            start = time.process_time()
             text = Text(msg["msg"])
             try:
                 lang_code = text.language.code
@@ -206,22 +239,45 @@ def process_data(data):
                 log.error(f"Error detecting language: {e}")
                 # So below block doesn't fail
                 lang_code = None
+            time_took = (time.process_time() - start) * 1000
+            polyglot_time += time_took
 
             # Blatant discrimination
             if lang_code == "en":
-                # Sentiment
+                start = time.process_time()
                 vs = analyzer.polarity_scores(str(msg["msg"]))
                 addendum = vs["compound"]
                 msg["sentiment"] = addendum
-
-                # Tokens
-                n = nlp(msg["msg"])
-                for tag in TAGS:
-                    tag_name = tag.lower()
-                    tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
-                    msg[f"words_{tag_name}"] = tags_flag
+                time_took = (time.process_time() - start) * 1000
+                sentiment_time += time_took
+
+                # Tokens
+                start = time.process_time()
+                tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
+                msg["tokens"] = tokens
+                # n = nlp(msg["msg"])
+                # for tag in TAGS:
+                #     tag_name = tag.lower()
+                #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
+                #     msg[f"words_{tag_name}"] = tags_flag
+                time_took = (time.process_time() - start) * 1000
+                nlp_time += time_took
 
         # Add the mutated message to the return buffer
         to_store.append(msg)
+        total_time += (time.process_time() - total_start) * 1000
+    log.debug("=====================================")
+    log.debug(f"Sentiment: {sentiment_time}")
+    log.debug(f"Regex: {regex_time}")
+    log.debug(f"Polyglot: {polyglot_time}")
+    log.debug(f"Date: {date_time}")
+    log.debug(f"NLP: {nlp_time}")
+    log.debug(f"Normalise: {normalise_time}")
+    log.debug(f"Hash: {hash_time}")
+    log.debug(f"Normal2: {normal2_time}")
+    log.debug(f"Soup: {soup_time}")
+    log.debug(f"Total: {total_time}")
+    log.debug("=====================================")
+
 
     return to_store
diff --git a/requirements.txt b/requirements.txt
index 81131a3..5bc11b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop
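
For reference, a minimal standalone sketch of the gensim filter chain this
patch adopts in place of the spaCy POS/lemma pass. It assumes only that
gensim is installed; the sample text and printed output are illustrative,
not taken from the repository:

    from gensim.parsing.preprocessing import (
        preprocess_string,
        remove_stopwords,
        strip_multiple_whitespaces,
        strip_non_alphanum,
        strip_numeric,
        strip_punctuation,
        strip_short,
        strip_tags,
    )

    # Mirrors CUSTOM_FILTERS in processing/process.py: lowercase, strip HTML
    # tags, punctuation, repeated whitespace and digits, drop stopwords and
    # short tokens, then remove any remaining non-alphanumeric characters.
    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        remove_stopwords,
        strip_short,
        strip_non_alphanum,
    ]

    text = "<p>The QUICK brown fox jumped over 2 lazy dogs!</p>"
    print(preprocess_string(text, CUSTOM_FILTERS))
    # ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dogs']

Unlike the removed spaCy pass, which emitted per-POS lemma lists
(words_noun, words_adj, ...), this stores one flat "tokens" list and does
no lemmatisation or POS tagging, trading linguistic detail for speed.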
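The timing instrumentation follows a single pattern throughout
process_data: snapshot time.process_time() before each stage, accumulate
the elapsed milliseconds into a per-stage counter, and dump every counter
at debug level once the batch is done. A condensed sketch of that pattern
(stage names and stand-in bodies invented for illustration):

    import time

    stage_ms = {"normalise": 0.0, "tokenise": 0.0}
    total_ms = 0.0

    for msg in ["some message", "another message"]:
        total_start = time.process_time()

        start = time.process_time()
        msg = msg.strip()  # stand-in for the field-normalisation stage
        stage_ms["normalise"] += (time.process_time() - start) * 1000

        start = time.process_time()
        tokens = msg.split()  # stand-in for the tokenisation stage
        stage_ms["tokenise"] += (time.process_time() - start) * 1000

        total_ms += (time.process_time() - total_start) * 1000

    for stage, ms in stage_ms.items():
        print(f"{stage}: {ms}")
    print(f"Total: {total_ms}")

Note that time.process_time() measures CPU time, not wall-clock time, so
stages that block on I/O (the Redis get/set in the hash step, for example)
will look cheaper than they really are; time.perf_counter() would capture
the waiting as well.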