Add per-stage timing instrumentation and switch from spaCy to gensim for tokenisation

This commit is contained in:
Mark Veidemanis 2022-10-01 14:46:45 +01:00
parent 40cf0c6430
commit 817bfd8835
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
4 changed files with 100 additions and 41 deletions

View File

@@ -16,7 +16,8 @@ COPY requirements.txt /code/
 COPY discord-patched.tgz /code/
 RUN python -m venv /venv
-RUN . /venv/bin/activate && pip install -r requirements.txt && python -m spacy download en_core_web_sm
+RUN . /venv/bin/activate && pip install -r requirements.txt
+# && python -m spacy download en_core_web_sm
 RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages

View File

@@ -15,7 +15,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop

View File

@@ -5,6 +5,9 @@ import random
 # For key generation
 import string
+# For timing
+import time
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -16,11 +19,21 @@ from math import ceil
 import orjson
 import regex
-# Tokenisation
-import spacy
 # For 4chan message parsing
 from bs4 import BeautifulSoup
+# Tokenisation
+# import spacy
+from gensim.parsing.preprocessing import (  # stem_text,
+    preprocess_string,
+    remove_stopwords,
+    strip_multiple_whitespaces,
+    strip_non_alphanum,
+    strip_numeric,
+    strip_punctuation,
+    strip_short,
+    strip_tags,
+)
 from numpy import array_split
 from polyglot.detect.base import logger as polyglot_logger
@@ -38,30 +51,17 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
-# For tokenisation
-# from gensim.parsing.preprocessing import (
-#     strip_tags,
-#     strip_punctuation,
-#     strip_numeric,
-#     stem_text,
-#     strip_multiple_whitespaces,
-#     strip_non_alphanum,
-#     remove_stopwords,
-#     strip_short,
-#     preprocess_string,
-# )
-# CUSTOM_FILTERS = [
-#     lambda x: x.lower(),
-#     strip_tags, #
-#     strip_punctuation, #
-#     strip_multiple_whitespaces,
-#     strip_numeric,
-#     remove_stopwords,
-#     strip_short,
-#     #stem_text,
-#     strip_non_alphanum, #
-# ]
+CUSTOM_FILTERS = [
+    lambda x: x.lower(),
+    strip_tags,  #
+    strip_punctuation,  #
+    strip_multiple_whitespaces,
+    strip_numeric,
+    remove_stopwords,
+    strip_short,
+    # stem_text,
+    strip_non_alphanum,  #
+]
 RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
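
The new CUSTOM_FILTERS chain can be exercised on its own to see what it produces. A minimal sketch, assuming only gensim is installed; the sample string and the expected output are illustrative, not taken from the repository:

from gensim.parsing.preprocessing import (
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_numeric,
    strip_punctuation,
    strip_short,
    strip_tags,
)

# Same filter order as the commit: lowercase, strip HTML tags and
# punctuation, collapse whitespace runs, drop digits, stopwords and
# tokens shorter than three characters.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
    strip_non_alphanum,
]

sample = "<b>Check</b> the 2 new posts!"  # illustrative input
print(preprocess_string(sample, CUSTOM_FILTERS))
# roughly: ['check', 'new', 'posts']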
@@ -70,8 +70,8 @@ polyglot_logger.setLevel("ERROR")
 warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
-TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
-nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
+# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
 log = util.get_logger("process")
@@ -133,29 +133,48 @@ async def spawn_processing_threads(data):
 def process_data(data):
     to_store = []
+    sentiment_time = 0.0
+    regex_time = 0.0
+    polyglot_time = 0.0
+    date_time = 0.0
+    nlp_time = 0.0
+    normalise_time = 0.0
+    hash_time = 0.0
+    normal2_time = 0.0
+    soup_time = 0.0
+    total_time = 0.0
     # Initialise sentiment analyser
     analyzer = SentimentIntensityAnalyzer()
     for msg in data:
+        total_start = time.process_time()
         # normalise fields
+        start = time.process_time()
         for key, value in list(msg.items()):
             if value is None:
                 del msg[key]
+        time_took = (time.process_time() - start) * 1000
+        normalise_time += time_took
         # Remove invalid UTF-8 characters
         # IRC and Discord
+        start = time.process_time()
         if "msg" in msg:
             msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])
         # 4chan - since we change the attributes below
         if "com" in msg:
             msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
+        time_took = (time.process_time() - start) * 1000
+        regex_time += time_took
         if msg["src"] == "4ch":
             board = msg["net"]
             thread = msg["channel"]
             # Calculate hash for post
+            start = time.process_time()
             post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
             hash = siphash(hash_key, post_normalised)
             hash = str(hash)
@@ -169,11 +188,18 @@ def process_data(data):
                 else:
                     msg["type"] = "update"
             db.r.set(redis_key, hash)
+            time_took = (time.process_time() - start) * 1000
+            hash_time += time_took
+            start = time.process_time()
             for key2, value in list(msg.items()):
                 if key2 in ATTRMAP:
                     msg[ATTRMAP[key2]] = msg[key2]
                     del msg[key2]
+            time_took = (time.process_time() - start) * 1000
+            normal2_time += time_took
+            start = time.process_time()
             if "ts" in msg:
                 old_time = msg["ts"]
                 # '08/30/22(Tue)02:25:37'
@@ -187,15 +213,22 @@ def process_data(data):
                 msg["ts"] = new_ts
             else:
                 raise Exception("No TS in msg")
+            time_took = (time.process_time() - start) * 1000
+            date_time += time_took
+            start = time.process_time()
             if "msg" in msg:
                 soup = BeautifulSoup(msg["msg"], "html.parser")
                 msg_str = soup.get_text(separator="\n")
                 msg["msg"] = msg_str
+            time_took = (time.process_time() - start) * 1000
+            soup_time += time_took
         # Annotate sentiment/NLP
         if "msg" in msg:
-            RE_BAD_CHARS.sub("", msg["msg"])
+            # RE_BAD_CHARS.sub("", msg["msg"])
             # Language
+            start = time.process_time()
             text = Text(msg["msg"])
             try:
                 lang_code = text.language.code
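
The hunk above elides the lines that turn 4chan's timestamp into new_ts. For reference, the format shown in the comment parses directly with strptime; a standalone sketch (the function name is illustrative, not the repository's code, and the UTC assumption is mine):

from datetime import datetime, timezone

def parse_4chan_ts(raw: str) -> int:
    # '08/30/22(Tue)02:25:37' -- %a consumes the bracketed weekday
    dt = datetime.strptime(raw, "%m/%d/%y(%a)%H:%M:%S")
    # Assuming the source timestamp is UTC; adjust if it differs.
    return int(dt.replace(tzinfo=timezone.utc).timestamp())

print(parse_4chan_ts("08/30/22(Tue)02:25:37"))  # 1661826337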
@@ -206,22 +239,45 @@ def process_data(data):
                 log.error(f"Error detecting language: {e}")
                 # So below block doesn't fail
                 lang_code = None
+            time_took = (time.process_time() - start) * 1000
+            polyglot_time += time_took
             # Blatant discrimination
             if lang_code == "en":
                 # Sentiment
+                start = time.process_time()
                 vs = analyzer.polarity_scores(str(msg["msg"]))
                 addendum = vs["compound"]
                 msg["sentiment"] = addendum
+                time_took = (time.process_time() - start) * 1000
+                sentiment_time += time_took
                 # Tokens
-                n = nlp(msg["msg"])
-                for tag in TAGS:
-                    tag_name = tag.lower()
-                    tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
-                    msg[f"words_{tag_name}"] = tags_flag
+                start = time.process_time()
+                tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
+                msg["tokens"] = tokens
+                # n = nlp(msg["msg"])
+                # for tag in TAGS:
+                #     tag_name = tag.lower()
+                #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
+                #     msg[f"words_{tag_name}"] = tags_flag
+                time_took = (time.process_time() - start) * 1000
+                nlp_time += time_took
         # Add the mutated message to the return buffer
         to_store.append(msg)
+        total_time += (time.process_time() - total_start) * 1000
+    log.debug("=====================================")
+    log.debug(f"Sentiment: {sentiment_time}")
+    log.debug(f"Regex: {regex_time}")
+    log.debug(f"Polyglot: {polyglot_time}")
+    log.debug(f"Date: {date_time}")
+    log.debug(f"NLP: {nlp_time}")
+    log.debug(f"Normalise: {normalise_time}")
+    log.debug(f"Hash: {hash_time}")
+    log.debug(f"Normal2: {normal2_time}")
+    log.debug(f"Soup: {soup_time}")
+    log.debug(f"Total: {total_time}")
+    log.debug("=====================================")
     return to_store
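
The start/time_took pairs added throughout process_data all follow one pattern and could be folded into a small helper. A sketch, assuming only the standard library (the timed name and timings dict are illustrative, not part of this commit):

import time
from collections import defaultdict
from contextlib import contextmanager

# Per-stage CPU-time accumulators in milliseconds. Note that
# time.process_time() counts CPU time of the current process, not
# wall-clock time, so blocking I/O (e.g. the Redis calls) barely
# registers in these totals.
timings = defaultdict(float)

@contextmanager
def timed(stage):
    start = time.process_time()
    try:
        yield
    finally:
        timings[stage] += (time.process_time() - start) * 1000

# Usage, equivalent to one start/time_took pair above:
with timed("normalise"):
    pass  # normalise fields here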

View File

@@ -16,7 +16,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop