neptune/core/lib/opensearch.py

285 lines
10 KiB
Python
Raw Normal View History

2022-07-21 12:47:02 +00:00
from django.conf import settings
from opensearchpy import OpenSearch
2022-07-21 12:51:27 +00:00
from opensearchpy.exceptions import RequestError
2022-07-21 12:47:02 +00:00
2022-07-21 12:51:55 +00:00
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
2022-07-21 12:47:02 +00:00
def initialise_opensearch():
2022-07-21 12:51:55 +00:00
"""
Inititialise the OpenSearch API endpoint.
"""
2022-07-21 12:47:02 +00:00
auth = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)
client = OpenSearch(
2022-07-21 12:47:10 +00:00
# fmt: off
2022-07-21 12:47:02 +00:00
hosts=[{"host": settings.OPENSEARCH_URL,
2022-07-21 12:47:10 +00:00
"port": settings.OPENSEARCH_PORT}],
2022-07-21 12:47:02 +00:00
http_compress=False, # enables gzip compression for request bodies
http_auth=auth,
# client_cert = client_cert_path,
# client_key = client_key_path,
use_ssl=settings.OPENSEARCH_TLS,
verify_certs=False,
ssl_assert_hostname=False,
ssl_show_warn=False,
# a_certs=ca_certs_path,
)
return client
2022-07-21 12:51:55 +00:00
client = initialise_opensearch()
def annotate_results(results_parsed):
"""
Accept a list of dict objects, search for the number of channels and users.
Add them to the object.
Mutate it in place. Does not return anything.
"""
# Figure out items with net (not discord)
nets = set()
for x in results_parsed:
if "net" in x:
nets.add(x["net"])
for net in nets:
# Annotate the online attribute from Threshold
nicks = [
x["nick"] for x in results_parsed if x["src"] == "irc" and x["net"] == net
]
channels = [
x["channel"]
for x in results_parsed
if x["src"] == "irc" and x["net"] == net
]
online_info = annotate_online(net, nicks)
2022-07-21 12:51:55 +00:00
# Annotate the number of users in the channel
num_users = annotate_num_users(net, channels)
2022-07-21 12:51:55 +00:00
# Annotate the number channels the user is on
num_chans = annotate_num_chans(net, nicks)
2022-07-21 12:51:55 +00:00
for item in results_parsed:
2022-07-29 21:41:53 +00:00
if "net" in item:
if item["net"] == net:
if "nick" in item:
if item["nick"] in online_info:
item["online"] = online_info[item["nick"]]
if "channel" in item:
if item["channel"] in num_users:
item["num_users"] = num_users[item["channel"]]
if "nick" in item:
if item["nick"] in num_chans:
item["num_chans"] = num_chans[item["nick"]]
2022-07-21 12:51:55 +00:00
def filter_blacklisted(user, response):
"""
Low level filter to take the raw OpenSearch response and remove
objects from it we want to keep secret.
Does not return, the object is mutated in place.
"""
response["redacted"] = 0
response["exemption"] = None
if user.is_superuser:
response["exemption"] = True
# is_anonymous = isinstance(user, AnonymousUser)
2022-07-21 12:51:55 +00:00
# For every hit from ES
for index, item in enumerate(list(response["hits"]["hits"])):
2022-07-21 12:51:55 +00:00
# For every blacklisted type
for blacklisted_type in settings.OPENSEARCH_BLACKLISTED.keys():
# Check this field we are matching exists
if blacklisted_type in item["_source"].keys():
content = item["_source"][blacklisted_type]
# For every item in the blacklisted array for the type
for blacklisted_item in settings.OPENSEARCH_BLACKLISTED[
blacklisted_type
]:
if blacklisted_item == str(content):
2022-07-21 12:51:55 +00:00
# Remove the item
if item in response["hits"]["hits"]:
2022-08-02 21:22:22 +00:00
# Anonymous
if user.is_anonymous:
2022-07-21 12:51:55 +00:00
response["hits"]["hits"].remove(item)
2022-08-02 21:22:22 +00:00
else:
if not user.is_superuser:
response["hits"]["hits"].remove(item)
else:
response["hits"]["hits"][index]["_source"][
"exemption"
] = True
2022-07-21 12:51:55 +00:00
# Let the UI know something was redacted
response["redacted"] += 1
2022-07-21 12:52:41 +00:00
def run_main_query(client, user, query, custom_query=False, index=None, size=None):
2022-07-21 12:51:55 +00:00
"""
Low level helper to run an ES query.
Accept a user to pass it to the filter, so we can
avoid filtering for superusers.
Accept fields and size, for the fields we want to match and the
number of results to return.
"""
2022-07-21 12:52:41 +00:00
if not index:
index = settings.OPENSEARCH_INDEX_MAIN
if custom_query:
search_query = query
else:
search_query = construct_query(query, size)
2022-07-21 12:51:55 +00:00
try:
2022-07-21 12:52:41 +00:00
response = client.search(body=search_query, index=index)
except RequestError as err:
print("OpenSearch error", err)
2022-07-21 12:51:55 +00:00
return False
filter_blacklisted(user, response)
return response
def query_results(request, size=None):
"""
API helper to alter the OpenSearch return format into something
a bit better to parse.
Accept a HTTP request object. Run the query, and annotate the
results with the other data we have.
"""
# is_anonymous = isinstance(request.user, AnonymousUser)
message = None
message_class = None
add_bool = []
if request.user.is_anonymous:
sizes = settings.OPENSEARCH_MAIN_SIZES_ANON
else:
sizes = settings.OPENSEARCH_MAIN_SIZES
2022-07-21 12:51:55 +00:00
if not size:
if "size" in request.POST:
size = request.POST["size"]
if size not in sizes:
message = "Size is not permitted"
message_class = "danger"
return {"message": message, "class": message_class}
if "source" in request.POST:
source = request.POST["source"]
if source not in settings.OPENSEARCH_MAIN_SOURCES:
message = "Invalid source"
message_class = "danger"
return {"message": message, "class": message_class}
if source != "all":
add_bool.append({"src": source})
if "check-sentiment" in request.POST:
if "sentiment" in request.POST:
sentiment = request.POST["sentiment"]
try:
sentiment = float(sentiment)
except ValueError:
message = "Sentiment is not a float"
message_class = "danger"
return {"message": message, "class": message_class}
2022-07-21 12:51:55 +00:00
if "query" in request.POST:
query = request.POST["query"]
search_query = construct_query(query, size)
if add_bool:
for item in add_bool:
search_query["query"]["bool"]["must"].append({"match": item})
2022-07-21 12:51:55 +00:00
results = run_main_query(
client,
request.user, # passed through run_main_query to filter_blacklisted
search_query,
custom_query=True,
size=size,
2022-07-21 12:51:55 +00:00
)
if not results:
return False
results_parsed = []
if "hits" in results.keys():
if "hits" in results["hits"]:
for item in results["hits"]["hits"]:
element = item["_source"]
element["id"] = item["_id"]
# Split the timestamp into date and time
2022-08-03 06:20:30 +00:00
if "ts" not in element:
if "time" in element: # will fix data later
ts = element["time"]
del element["time"]
element["ts"] = ts
if "ts" in element:
ts = element["ts"]
ts_spl = ts.split("T")
date = ts_spl[0]
time = ts_spl[1]
element["date"] = date
element["time"] = time
2022-07-21 12:51:55 +00:00
results_parsed.append(element)
annotate_results(results_parsed)
context = {
"query": query,
"results": results_parsed,
"card": results["hits"]["total"]["value"],
"took": results["took"],
"redacted": results["redacted"],
"exemption": results["exemption"],
}
return context
def query_single_result(request):
context = query_results(request, 1)
dedup_set = {item["nick"] for item in context["results"]}
if dedup_set:
2022-07-21 12:51:55 +00:00
context["item"] = context["results"][0]
2022-07-21 12:51:55 +00:00
return (1, context)
def construct_query(query, size):
"""
Accept some query parameters and construct an OpenSearch query.
"""
if not size:
size = 5
2022-07-21 12:47:02 +00:00
query = {
"size": size,
2022-07-21 12:47:02 +00:00
"query": {
"bool": {
"must": [
{
"query_string": {
"query": query,
# "fields": fields,
# "default_field": "msg",
# "type": "best_fields",
"fuzziness": "AUTO",
"fuzzy_transpositions": True,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
# "minimum_should_match": 1,
"default_operator": "or",
"analyzer": "standard",
"lenient": True,
"boost": 1,
"allow_leading_wildcard": True,
# "enable_position_increments": False,
"phrase_slop": 3,
# "max_determinized_states": 10000,
"quote_field_suffix": "",
"quote_analyzer": "standard",
"analyze_wildcard": False,
"auto_generate_synonyms_phrase_query": True,
}
}
]
2022-07-21 12:47:02 +00:00
}
},
2022-07-21 12:51:38 +00:00
"sort": [
{
"ts": {
"order": "desc",
}
}
],
2022-07-21 12:47:02 +00:00
}
return query