Fix merge conflict
commit cb11ce9b12
@@ -0,0 +1,52 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Java tuning
DRUID_XMX=1g
DRUID_XMS=1g
DRUID_MAXNEWSIZE=250m
DRUID_NEWSIZE=250m
DRUID_MAXDIRECTMEMORYSIZE=6172m

druid_emitter_logging_logLevel=debug

druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-kafka-indexing-service"]

druid_zk_service_host=zookeeper

druid_metadata_storage_host=
druid_metadata_storage_type=postgresql
druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid
druid_metadata_storage_connector_user=druid
druid_metadata_storage_connector_password=FoolishPassword

druid_coordinator_balancer_strategy=cachingCost

druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
druid_indexer_fork_property_druid_processing_buffer_sizeBytes=256MiB

druid_storage_type=local
druid_storage_storageDirectory=/opt/shared/segments
druid_indexer_logs_type=file
druid_indexer_logs_directory=/opt/shared/indexing-logs

druid_processing_numThreads=2
druid_processing_numMergeBuffers=2

DRUID_LOG4J=<?xml version="1.0" encoding="UTF-8" ?><Configuration status="WARN"><Appenders><Console name="Console" target="SYSTEM_OUT"><PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/></Console></Appenders><Loggers><Root level="info"><AppenderRef ref="Console"/></Root><Logger name="org.apache.druid.jetty.RequestLog" additivity="false" level="DEBUG"><AppenderRef ref="Console"/></Logger></Loggers></Configuration>
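
# Sizing sketch (illustrative, following the formula in the Druid docs):
# direct memory needed per task ~ buffer.sizeBytes * (numMergeBuffers + numThreads + 1)
# = 256MiB * (2 + 2 + 1) = 1280MiB, which fits inside the -XX:MaxDirectMemorySize=3g
# set in druid_indexer_runner_javaOptsArray above.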
@@ -0,0 +1,202 @@
import asyncio
import os
import random

# For key generation
import string

# Squash errors
import warnings
from concurrent.futures import ProcessPoolExecutor

# For timestamp processing
from datetime import datetime
from math import ceil

import orjson

# Tokenisation
import spacy

# For 4chan message parsing
from bs4 import BeautifulSoup
from numpy import array_split
from polyglot.detect.base import logger as polyglot_logger

# For NLP
from polyglot.text import Text
from pycld2 import error as cld2_error
from siphashc import siphash

# For sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import db
import util

# 4chan schema
from schemas.ch4_s import ATTRMAP

# For tokenisation
# from gensim.parsing.preprocessing import (
#     strip_tags,
#     strip_punctuation,
#     strip_numeric,
#     stem_text,
#     strip_multiple_whitespaces,
#     strip_non_alphanum,
#     remove_stopwords,
#     strip_short,
#     preprocess_string,
# )

# CUSTOM_FILTERS = [
#     lambda x: x.lower(),
#     strip_tags,  #
#     strip_punctuation,  #
#     strip_multiple_whitespaces,
#     strip_numeric,
#     remove_stopwords,
#     strip_short,
#     # stem_text,
#     strip_non_alphanum,  #
# ]

# Squash errors
polyglot_logger.setLevel("ERROR")
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")


TAGS = ["NOUN", "ADJ", "VERB", "ADV"]
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


log = util.get_logger("process")

# Maximum number of CPU threads to use for post processing
CPU_THREADS = os.cpu_count()

p = ProcessPoolExecutor(CPU_THREADS)

def get_hash_key():
    hash_key = db.r.get("hashing_key")
    if not hash_key:
        letters = string.ascii_lowercase
        hash_key = "".join(random.choice(letters) for i in range(16))
        log.debug(f"Created new hash key: {hash_key}")
        db.r.set("hashing_key", hash_key)
    else:
        hash_key = hash_key.decode("ascii")
        log.debug(f"Decoded hash key: {hash_key}")
    return hash_key


hash_key = get_hash_key()

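# The hashing key is persisted in Redis so that siphash digests computed on
# earlier runs stay comparable after a restart; if "hashing_key" were ever
# regenerated, every cached post hash checked in process_data() below would
# stop matching and previously seen posts would be re-stored as updates.
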
async def spawn_processing_threads(data):
    len_data = len(data)
    log.debug(f"Spawning processing threads for batch of {len_data} messages")

    loop = asyncio.get_event_loop()
    tasks = []

    if len(data) < CPU_THREADS:
        split_data = [data]
    else:
        msg_per_core = int(len(data) / CPU_THREADS)
        split_data = array_split(data, ceil(len(data) / msg_per_core))
    for index, split in enumerate(split_data):
        log.debug(f"Delegating processing of {len(split)} messages to thread {index}")
        task = loop.run_in_executor(p, process_data, split)
        tasks.append(task)

    results = [await task for task in tasks]
    log.debug(f"Results from processing of {len_data} messages: {len(results)}")

    # Join the results back from the split list
    flat_list = [item for sublist in results for item in sublist]
    await db.store_kafka_batch(flat_list)

    log.debug(f"Finished processing {len_data} messages")

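# Worked example of the batch split above (illustrative numbers): with
# CPU_THREADS = 8 and a batch of 100 messages, msg_per_core = int(100 / 8) = 12,
# so array_split() yields ceil(100 / 12) = 9 roughly equal chunks, each
# submitted to the process pool as its own process_data() task.
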
def process_data(data):
    to_store = []

    # Initialise sentiment analyser
    analyzer = SentimentIntensityAnalyzer()
    for msg in data:
        if msg["src"] == "4ch":
            board = msg["net"]
            thread = msg["channel"]

            # Calculate hash for post
            post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
            hash = siphash(hash_key, post_normalised)
            hash = str(hash)
            redis_key = f"cache.{board}.{thread}.{msg['no']}"
            key_content = db.r.get(redis_key)
            if key_content:
                key_content = key_content.decode("ascii")
                if key_content == hash:
                    # This deletes the message since the append at the end won't be hit
                    continue
                else:
                    msg["type"] = "update"
            db.r.set(redis_key, hash)
            for key2, value in list(msg.items()):
                if key2 in ATTRMAP:
                    msg[ATTRMAP[key2]] = msg[key2]
                    del msg[key2]
            if "ts" in msg:
                old_time = msg["ts"]
                # '08/30/22(Tue)02:25:37'
                time_spl = old_time.split(":")
                if len(time_spl) == 3:
                    old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
                else:
                    old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
                # new_ts = old_ts.isoformat()
                new_ts = int(old_ts.timestamp())
                msg["ts"] = new_ts
            else:
                raise Exception("No TS in msg")
            if "msg" in msg:
                soup = BeautifulSoup(msg["msg"], "html.parser")
                msg_str = soup.get_text(separator="\n")
                msg["msg"] = msg_str
            # Annotate sentiment/NLP
            if "msg" in msg:
                # Language
                text = Text(msg["msg"])
                try:
                    lang_code = text.language.code
                    lang_name = text.language.name
                    msg["lang_code"] = lang_code
                    msg["lang_name"] = lang_name
                except cld2_error as e:
                    log.error(f"Error detecting language: {e}")
                    # So below block doesn't fail
                    lang_code = None

                # Blatant discrimination
                if lang_code == "en":

                    # Sentiment
                    vs = analyzer.polarity_scores(str(msg["msg"]))
                    addendum = vs["compound"]
                    msg["sentiment"] = addendum

                    # Tokens
                    n = nlp(msg["msg"])
                    for tag in TAGS:
                        tag_name = tag.lower()
                        tags_flag = [token.lemma_ for token in n if token.pos_ == tag]
                        msg[f"words_{tag_name}"] = tags_flag

            # Add the mutated message to the return buffer
            to_store.append(msg)
    return to_store
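A minimal driver for the code above, as a sketch only: it assumes Redis is reachable through db.r, that db.store_kafka_batch and util.get_logger are configured, that the spacy and polyglot models are installed, and that this file is importable as a module named process. The sample message is invented for illustration; its fields mirror the keys the code reads (src, net, channel, no, ts, msg).

import asyncio

from process import spawn_processing_threads  # assumed module name

sample_batch = [
    {
        "src": "4ch",                        # source tag checked in process_data()
        "net": "g",                          # board
        "channel": "12345678",               # thread ID
        "no": 123456789,                     # post number, used in the Redis cache key
        "ts": "08/30/22(Tue)02:25:37",       # 4chan timestamp format parsed above
        "msg": "<p>Hello <b>world</b></p>",  # HTML body, stripped with BeautifulSoup
    }
]

asyncio.run(spawn_processing_threads(sample_batch))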