Add timing instrumentation and switch to gensim for tokenisation

This commit is contained in:
Mark Veidemanis 2022-10-01 14:46:45 +01:00
parent 40cf0c6430
commit 817bfd8835
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
4 changed files with 100 additions and 41 deletions
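The tokenisation switch replaces the spaCy POS-tag/lemma pass with gensim's preprocess_string filter chain: lowercase, strip HTML tags, punctuation, extra whitespace, digits, stopwords and short tokens. A minimal sketch of the new behaviour (the sample string is illustrative, and the expected output assumes gensim's default stopword list):

from gensim.parsing.preprocessing import (
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_numeric,
    strip_punctuation,
    strip_short,
    strip_tags,
)

CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
    strip_non_alphanum,
]

# Expected: ["hello", "worlds"] with gensim's default stopword list
# ("there" is a stopword, "42" is stripped as numeric)
print(preprocess_string("<b>Hello</b> there, 42 worlds!", CUSTOM_FILTERS))

Unlike the spaCy version, this produces a single flat "tokens" list rather than per-POS word lists (words_noun, words_verb, ...), trading grammatical detail for speed.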

View File

@@ -16,7 +16,8 @@ COPY requirements.txt /code/
 COPY discord-patched.tgz /code/
 RUN python -m venv /venv
-RUN . /venv/bin/activate && pip install -r requirements.txt && python -m spacy download en_core_web_sm
+RUN . /venv/bin/activate && pip install -r requirements.txt
+# && python -m spacy download en_core_web_sm
 RUN tar xf /code/discord-patched.tgz -C /venv/lib/python3.10/site-packages

View File

@@ -15,7 +15,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop

View File

@@ -5,6 +5,9 @@ import random
 # For key generation
 import string
+# For timing
+import time
 # Squash errors
 import warnings
 from concurrent.futures import ProcessPoolExecutor
@@ -16,11 +19,21 @@ from math import ceil
 import orjson
 import regex
-# Tokenisation
-import spacy
 # For 4chan message parsing
 from bs4 import BeautifulSoup
+# Tokenisation
+# import spacy
+from gensim.parsing.preprocessing import (  # stem_text,
+    preprocess_string,
+    remove_stopwords,
+    strip_multiple_whitespaces,
+    strip_non_alphanum,
+    strip_numeric,
+    strip_punctuation,
+    strip_short,
+    strip_tags,
+)
 from numpy import array_split
 from polyglot.detect.base import logger as polyglot_logger
@@ -38,30 +51,17 @@ import util
 # 4chan schema
 from schemas.ch4_s import ATTRMAP
-# For tokenisation
-# from gensim.parsing.preprocessing import (
-#     strip_tags,
-#     strip_punctuation,
-#     strip_numeric,
-#     stem_text,
-#     strip_multiple_whitespaces,
-#     strip_non_alphanum,
-#     remove_stopwords,
-#     strip_short,
-#     preprocess_string,
-# )
-# CUSTOM_FILTERS = [
-#     lambda x: x.lower(),
-#     strip_tags,  #
-#     strip_punctuation,  #
-#     strip_multiple_whitespaces,
-#     strip_numeric,
-#     remove_stopwords,
-#     strip_short,
-#     #stem_text,
-#     strip_non_alphanum,  #
-# ]
+CUSTOM_FILTERS = [
+    lambda x: x.lower(),
+    strip_tags,  #
+    strip_punctuation,  #
+    strip_multiple_whitespaces,
+    strip_numeric,
+    remove_stopwords,
+    strip_short,
+    # stem_text,
+    strip_non_alphanum,  #
+]
 RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
@@ -70,8 +70,8 @@ polyglot_logger.setLevel("ERROR")
 warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
-TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
-nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+# TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
+# nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
 log = util.get_logger("process")
@@ -133,29 +133,48 @@ async def spawn_processing_threads(data):
 def process_data(data):
     to_store = []
+    sentiment_time = 0.0
+    regex_time = 0.0
+    polyglot_time = 0.0
+    date_time = 0.0
+    nlp_time = 0.0
+    normalise_time = 0.0
+    hash_time = 0.0
+    normal2_time = 0.0
+    soup_time = 0.0
+    total_time = 0.0
     # Initialise sentiment analyser
     analyzer = SentimentIntensityAnalyzer()
     for msg in data:
+        total_start = time.process_time()
         # normalise fields
+        start = time.process_time()
         for key, value in list(msg.items()):
             if value is None:
                 del msg[key]
+        time_took = (time.process_time() - start) * 1000
+        normalise_time += time_took
         # Remove invalid UTF-8 characters
         # IRC and Discord
+        start = time.process_time()
         if "msg" in msg:
             msg["msg"] = RE_BAD_CHARS.sub("", msg["msg"])
         # 4chan - since we change the attributes below
         if "com" in msg:
             msg["com"] = RE_BAD_CHARS.sub("", msg["com"])
+        time_took = (time.process_time() - start) * 1000
+        regex_time += time_took
         if msg["src"] == "4ch":
             board = msg["net"]
             thread = msg["channel"]
             # Calculate hash for post
+            start = time.process_time()
             post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
             hash = siphash(hash_key, post_normalised)
             hash = str(hash)
@@ -169,11 +188,18 @@ def process_data(data):
             else:
                 msg["type"] = "update"
                 db.r.set(redis_key, hash)
+            time_took = (time.process_time() - start) * 1000
+            hash_time += time_took
+            start = time.process_time()
             for key2, value in list(msg.items()):
                 if key2 in ATTRMAP:
                     msg[ATTRMAP[key2]] = msg[key2]
                     del msg[key2]
+            time_took = (time.process_time() - start) * 1000
+            normal2_time += time_took
+            start = time.process_time()
             if "ts" in msg:
                 old_time = msg["ts"]
                 # '08/30/22(Tue)02:25:37'
@@ -187,15 +213,22 @@ def process_data(data):
                 msg["ts"] = new_ts
             else:
                 raise Exception("No TS in msg")
+            time_took = (time.process_time() - start) * 1000
+            date_time += time_took
+            start = time.process_time()
             if "msg" in msg:
                 soup = BeautifulSoup(msg["msg"], "html.parser")
                 msg_str = soup.get_text(separator="\n")
                 msg["msg"] = msg_str
+            time_took = (time.process_time() - start) * 1000
+            soup_time += time_took
         # Annotate sentiment/NLP
         if "msg" in msg:
-            RE_BAD_CHARS.sub("", msg["msg"])
+            # RE_BAD_CHARS.sub("", msg["msg"])
             # Language
+            start = time.process_time()
             text = Text(msg["msg"])
             try:
                 lang_code = text.language.code
@@ -206,22 +239,45 @@ def process_data(data):
                 log.error(f"Error detecting language: {e}")
                 # So below block doesn't fail
                 lang_code = None
+            time_took = (time.process_time() - start) * 1000
+            polyglot_time += time_took
             # Blatant discrimination
             if lang_code == "en":
                 # Sentiment
+                start = time.process_time()
                 vs = analyzer.polarity_scores(str(msg["msg"]))
                 addendum = vs["compound"]
                 msg["sentiment"] = addendum
+                time_took = (time.process_time() - start) * 1000
+                sentiment_time += time_took
-                # Tokens
-                n = nlp(msg["msg"])
-                for tag in TAGS:
-                    tag_name = tag.lower()
-                    tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
-                    msg[f"words_{tag_name}"] = tags_flag
+                # Tokens
+                start = time.process_time()
+                tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
+                msg["tokens"] = tokens
+                # n = nlp(msg["msg"])
+                # for tag in TAGS:
+                #     tag_name = tag.lower()
+                #     tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
+                #     msg[f"words_{tag_name}"] = tags_flag
+                time_took = (time.process_time() - start) * 1000
+                nlp_time += time_took
         # Add the mutated message to the return buffer
         to_store.append(msg)
+        total_time += (time.process_time() - total_start) * 1000
+    log.debug("=====================================")
+    log.debug(f"Sentiment: {sentiment_time}")
+    log.debug(f"Regex: {regex_time}")
+    log.debug(f"Polyglot: {polyglot_time}")
+    log.debug(f"Date: {date_time}")
+    log.debug(f"NLP: {nlp_time}")
+    log.debug(f"Normalise: {normalise_time}")
+    log.debug(f"Hash: {hash_time}")
+    log.debug(f"Normal2: {normal2_time}")
+    log.debug(f"Soup: {soup_time}")
+    log.debug(f"Total: {total_time}")
+    log.debug("=====================================")
     return to_store
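The timing half of the commit brackets every stage with time.process_time() and accumulates per-stage totals in milliseconds, logged once per batch. The repeated start/time_took pairs could equally be factored into a context manager; a hypothetical refactor sketch (not part of the commit, all names made up):

import time
from collections import defaultdict
from contextlib import contextmanager

stage_ms = defaultdict(float)

@contextmanager
def timed(stage):
    # Accumulate CPU time spent inside this block, in milliseconds
    start = time.process_time()
    try:
        yield
    finally:
        stage_ms[stage] += (time.process_time() - start) * 1000

# Usage would mirror the per-stage blocks in process_data():
with timed("normalise"):
    pass  # normalise fields here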

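One detail worth noting in the hash block: orjson.OPT_SORT_KEYS makes the serialisation deterministic, so the siphash compared against the value stored in Redis only changes when the post content actually changes, not when dict key order does. A quick illustration:

import orjson

a = orjson.dumps({"b": 1, "a": 2}, option=orjson.OPT_SORT_KEYS)
b = orjson.dumps({"a": 2, "b": 1}, option=orjson.OPT_SORT_KEYS)
assert a == b  # both serialise to b'{"a":2,"b":1}'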
View File

@@ -16,7 +16,8 @@ pycld2
 morfessor
 six
 nltk
-spacy
+#spacy
+gensim
 python-Levenshtein
 orjson
 uvloop