356 lines
13 KiB
Python
356 lines
13 KiB
Python
from django.conf import settings
|
|
from opensearchpy import OpenSearch
|
|
from opensearchpy.exceptions import RequestError
|
|
|
|
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
|
|
|
|
|
|
def initialise_opensearch():
    """
    Create the OpenSearch client from the Django settings.

    Host, port, credentials and the TLS switch are read from ``settings``.
    Certificate verification, hostname assertion and the related SSL
    warnings are all switched off.
    """
    credentials = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)
    node = {"host": settings.OPENSEARCH_URL, "port": settings.OPENSEARCH_PORT}
    options = {
        "hosts": [node],
        # http_compress would gzip request bodies; it is left disabled
        "http_compress": False,
        "http_auth": credentials,
        # "client_cert": client_cert_path,
        # "client_key": client_key_path,
        "use_ssl": settings.OPENSEARCH_TLS,
        "verify_certs": False,
        "ssl_assert_hostname": False,
        "ssl_show_warn": False,
        # "ca_certs": ca_certs_path,
    }
    return OpenSearch(**options)
|
|
|
|
|
|
# Module-level client, created once when this module is imported and
# passed to run_main_query by query_results.
client = initialise_opensearch()
|
|
|
|
|
|
def annotate_results(results_parsed):
    """
    Annotate parsed search results with presence data from Threshold.

    For every network found in ``results_parsed``, the nicks and channels
    of that network's IRC results are passed to ``annotate_online``,
    ``annotate_num_users`` and ``annotate_num_chans``. Whatever those
    lookups hold for a result's nick or channel is stored on the result
    under the ``online``, ``num_users`` and ``num_chans`` keys.

    Results without a "net" key are left untouched, and results missing
    "src", "nick" or "channel" are skipped for that lookup instead of
    raising KeyError.

    :param results_parsed: list of result dicts; mutated in place
    :return: None
    """
    # One pass over the results: bucket them by network and collect the
    # nicks/channels of the IRC results for the lookups below.
    per_net = {}
    for result in results_parsed:
        if "net" not in result:
            continue
        members, nicks, channels = per_net.setdefault(
            result["net"], ([], [], [])
        )
        members.append(result)
        if result.get("src") == "irc":
            if "nick" in result:
                nicks.append(result["nick"])
            if "channel" in result:
                channels.append(result["channel"])

    for net, (members, nicks, channels) in per_net.items():
        # Annotate the online attribute from Threshold
        online_info = annotate_online(net, nicks)
        # Annotate the number of users in the channel
        num_users = annotate_num_users(net, channels)
        # Annotate the number of channels the user is on
        num_chans = annotate_num_chans(net, nicks)

        # Every result on this network is annotated, not only IRC ones
        for result in members:
            if "nick" in result:
                nick = result["nick"]
                if nick in online_info:
                    result["online"] = online_info[nick]
                if nick in num_chans:
                    result["num_chans"] = num_chans[nick]
            if "channel" in result:
                channel = result["channel"]
                if channel in num_users:
                    result["num_users"] = num_users[channel]
|
|
|
|
|
|
def _matches_blacklist(source):
    """
    Return True if a blacklisted field of ``source`` holds a blacklisted value.

    ``settings.OPENSEARCH_BLACKLISTED`` maps a field name to the values that
    must be hidden; the field content is compared as a string.
    """
    for field, banned_values in settings.OPENSEARCH_BLACKLISTED.items():
        if field not in source:
            continue
        content = str(source[field])
        if any(banned == content for banned in banned_values):
            return True
    return False


def filter_blacklisted(user, response):
    """
    Low level filter to take the raw OpenSearch response and remove
    objects from it we want to keep secret.

    Sets ``response["redacted"]`` to the number of blacklisted hits that
    did not already carry an "exemption" field, and ``response["exemption"]``
    to True for superusers (None otherwise). Blacklisted hits are dropped
    for anonymous and non-superuser users; for superusers they are kept and
    their ``_source`` is flagged with ``"exemption": True``.

    Each hit is checked once and the kept hits are collected in a new list,
    so no index bookkeeping or list membership scan is needed.

    :param user: object exposing ``is_superuser`` and ``is_anonymous``
    :param response: raw OpenSearch response dict; mutated in place
    :return: None
    """
    response["redacted"] = 0
    response["exemption"] = True if user.is_superuser else None

    visible_hits = []
    for hit in response["hits"]["hits"]:
        # Falsy entries were never part of the filtered result
        if not hit:
            continue
        # A hit without _source has no field that could match
        source = hit.get("_source", {})
        if _matches_blacklist(source):
            # Let the UI know something was redacted
            if "exemption" not in source:
                response["redacted"] += 1
            # Only an authenticated superuser gets to see the hit
            if user.is_anonymous or not user.is_superuser:
                continue
            source["exemption"] = True
        visible_hits.append(hit)

    response["hits"]["hits"] = visible_hits
|
|
|
|
|
|
def run_main_query(client, user, query, custom_query=False, index=None, size=None):
    """
    Low level helper to run a search against OpenSearch.

    ``query`` is sent as-is when ``custom_query`` is set; otherwise it is
    treated as search text and turned into a request body by
    ``construct_query`` together with ``size``. ``index`` falls back to
    ``settings.OPENSEARCH_INDEX_MAIN`` when it is not given.

    The response is passed through ``filter_blacklisted`` with ``user``
    before it is returned. On a ``RequestError`` the error is printed and
    False is returned instead.
    """
    target_index = index or settings.OPENSEARCH_INDEX_MAIN
    body = query if custom_query else construct_query(query, size)
    try:
        response = client.search(body=body, index=target_index)
    except RequestError as err:
        print("OpenSearch error", err)
        return False
    filter_blacklisted(user, response)
    return response
|
|
|
|
|
|
def _query_error(message):
    """
    Build the payload returned to the caller when request validation fails.
    """
    return {"message": message, "class": "danger"}


def _split_timestamp(element):
    """
    Add separate "date" and "time" fields derived from the "ts" of a result.
    """
    # Some documents carry the timestamp under "time" instead of "ts"
    if "ts" not in element and "time" in element:
        element["ts"] = element.pop("time")
    if "ts" not in element:
        return
    parts = element["ts"].split("T")
    # Without a "T" separator there is nothing to split; leave it alone
    if len(parts) < 2:
        return
    element["date"] = parts[0]
    element["time"] = parts[1]


def query_results(request, size=None):
    """
    API helper to alter the OpenSearch return format into something
    a bit better to parse.

    Accept a HTTP request object. Validate the POST parameters ("size",
    "source", "dates", "sorting", "check-sentiment", "sentiment-method",
    "sentiment", "query"), run the query, and annotate the results with
    the other data we have.

    :param request: HTTP request; ``request.POST`` and ``request.user`` are read
    :param size: number of results; when falsy, POST "size" is used and
        must be one of the sizes permitted for the user
    :return: a context dict with "query", "results", "card", "took",
        "redacted" and "exemption"; a dict with "message" and "class" when
        a parameter is invalid or missing; False when the query itself failed
    """
    post = request.POST
    must_clauses = []
    must_not_clauses = []
    sort = None

    if request.user.is_anonymous:
        sizes = settings.OPENSEARCH_MAIN_SIZES_ANON
    else:
        sizes = settings.OPENSEARCH_MAIN_SIZES
    if not size and "size" in post:
        size = post["size"]
        if size not in sizes:
            return _query_error("Size is not permitted")

    if "source" in post:
        source = post["source"]
        if source not in settings.OPENSEARCH_MAIN_SOURCES:
            return _query_error("Invalid source")
        if source != "all":
            must_clauses.append({"match": {"src": source}})

    if "dates" in post:
        spl = post["dates"].split(" - ")
        # An empty part means no usable range was submitted; skip the filter
        if all(spl):
            if len(spl) != 2:
                return _query_error("Invalid dates")
            from_ts, to_ts = (f"{x.replace(' ', 'T')}Z" for x in spl)
            must_clauses.append({"range": {"ts": {"gt": from_ts, "lt": to_ts}}})

    if "sorting" in post:
        sorting = post["sorting"]
        if sorting not in ("asc", "desc", "none"):
            return _query_error("Invalid sort")
        if sorting in ("asc", "desc"):
            sort = [{"ts": {"order": sorting}}]

    if "check-sentiment" in post:
        if "sentiment-method" not in post:
            return _query_error("No sentiment method")
        sentiment = None
        if "sentiment" in post:
            try:
                sentiment = float(post["sentiment"])
            except ValueError:
                return _query_error("Sentiment is not a float")
        sentiment_method = post["sentiment-method"]
        # These methods compare against the submitted value, so it must exist
        if sentiment is None and sentiment_method in ("below", "above", "exact"):
            return _query_error("No sentiment value")
        if sentiment_method == "below":
            must_clauses.append({"range": {"sentiment": {"lt": sentiment}}})
        elif sentiment_method == "above":
            must_clauses.append({"range": {"sentiment": {"gt": sentiment}}})
        elif sentiment_method == "exact":
            must_clauses.append({"match": {"sentiment": sentiment}})
        elif sentiment_method == "nonzero":
            must_not_clauses.append({"match": {"sentiment": 0}})

    # Without search text there is nothing to build a query from
    if "query" not in post:
        return _query_error("No query")
    query = post["query"]

    search_query = construct_query(query, size)
    bool_query = search_query["query"]["bool"]
    bool_query["must"].extend(must_clauses)
    if must_not_clauses:
        bool_query.setdefault("must_not", []).extend(must_not_clauses)
    if sort:
        search_query["sort"] = sort

    results = run_main_query(
        client,
        request.user,  # passed through run_main_query to filter_blacklisted
        search_query,
        custom_query=True,
        size=size,
    )
    if not results:
        return False

    results_parsed = []
    for hit in results.get("hits", {}).get("hits", []):
        element = hit["_source"]
        element["id"] = hit["_id"]
        # Split the timestamp into date and time
        _split_timestamp(element)
        results_parsed.append(element)

    annotate_results(results_parsed)

    return {
        "query": query,
        "results": results_parsed,
        "card": results["hits"]["total"]["value"],
        "took": results["took"],
        "redacted": results["redacted"],
        "exemption": results["exemption"],
    }
|
|
|
|
|
|
def query_single_result(request):
    """
    Run the search with a result size of one and expose the hit as "item".

    The value returned by ``query_results`` is passed through unchanged
    when it is False or a message dict without "results", so the caller
    can report the failure instead of hitting a TypeError/KeyError here.

    :param request: HTTP request handed on to ``query_results``
    :return: tuple ``(1, context)``; ``context["item"]`` is the first
        result when there is at least one
    """
    context = query_results(request, 1)
    if not context or "results" not in context:
        return (1, context)

    results = context["results"]
    if results:
        context["item"] = results[0]

    return (1, context)
|
|
|
|
|
|
def construct_query(query, size):
    """
    Build the OpenSearch request body for a free-text search.

    The search text is placed in a single ``query_string`` clause inside a
    ``bool``/``must`` list. A new body is built on every call. ``size``
    falls back to 5 when it is falsy.
    """
    query_string_options = {
        "query": query,
        # "fields": fields,
        # "default_field": "msg",
        # "type": "best_fields",
        "fuzziness": "AUTO",
        "fuzzy_transpositions": True,
        "fuzzy_max_expansions": 50,
        "fuzzy_prefix_length": 0,
        # "minimum_should_match": 1,
        "default_operator": "or",
        "analyzer": "standard",
        "lenient": True,
        "boost": 1,
        "allow_leading_wildcard": True,
        # "enable_position_increments": False,
        "phrase_slop": 3,
        # "max_determinized_states": 10000,
        "quote_field_suffix": "",
        "quote_analyzer": "standard",
        "analyze_wildcard": False,
        "auto_generate_synonyms_phrase_query": True,
    }
    return {
        "size": size or 5,
        "query": {"bool": {"must": [{"query_string": query_string_options}]}},
    }
|