neptune/core/db/__init__.py

import random
import string
import time
from abc import ABC, abstractmethod
from math import floor, log10

import orjson
from django.conf import settings
from siphashc import siphash

from core import r
from core.db.processing import annotate_results
from core.util import logs


def remove_defaults(query_params):
    for field, value in list(query_params.items()):
        if field in settings.DRILLDOWN_DEFAULT_PARAMS:
            if value == settings.DRILLDOWN_DEFAULT_PARAMS[field]:
                del query_params[field]


def add_defaults(query_params):
    for field, value in settings.DRILLDOWN_DEFAULT_PARAMS.items():
        if field not in query_params:
            query_params[field] = value
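

# Example (illustrative, assuming settings.DRILLDOWN_DEFAULT_PARAMS were
# {"size": 15}; the real defaults live in Django settings):
#   params = {"query": "foo", "size": 15}
#   remove_defaults(params)  # params == {"query": "foo"}
#   add_defaults(params)     # params == {"query": "foo", "size": 15}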


def dedup_list(data, check_keys):
    """
    Remove duplicate dictionaries from a list, comparing only the values of
    check_keys. Each run of hidden duplicates is replaced with a control
    entry recording how many items were removed.
    """
    seen = set()
    out = []
    dup_count = 0
    for x in data:
        dedupe_key = tuple(x[k] for k in check_keys if k in x)
        if dedupe_key in seen:
            dup_count += 1
            continue
        if dup_count > 0:
            out.append({"type": "control", "hidden": dup_count})
            dup_count = 0
        out.append(x)
        seen.add(dedupe_key)
    if dup_count > 0:
        out.append({"type": "control", "hidden": dup_count})
    return out
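

# Example (illustrative):
#   >>> dedup_list([{"msg": "hi"}, {"msg": "hi"}, {"msg": "yo"}], ["msg"])
#   [{'msg': 'hi'}, {'type': 'control', 'hidden': 1}, {'msg': 'yo'}]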


class StorageBackend(ABC):
    def __init__(self, name):
        self.log = logs.get_logger(name)
        self.log.info(f"Initialising storage backend {name}")
        self.initialise_caching()
        # self.initialise()

    @abstractmethod
    def initialise(self, **kwargs):
        pass

    def initialise_caching(self):
        hash_key = r.get("cache_hash_key")
        if not hash_key:
            letters = string.ascii_lowercase
            hash_key = "".join(random.choice(letters) for _ in range(16))
            self.log.debug(f"Created new hash key: {hash_key}")
            r.set("cache_hash_key", hash_key)
        else:
            hash_key = hash_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {hash_key}")
        self.hash_key = hash_key
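
    # The hash key is persisted in the shared key/value store (r), so every
    # worker derives identical cache keys for identical queries; query()
    # below siphashes the normalised query body with it.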

    @abstractmethod
    def construct_query(self, **kwargs):
        pass

    def parse_query(self, query_params, tags, size, custom_query, add_bool, **kwargs):
        query_created = False
        if "query" in query_params:
            query = query_params["query"]
            search_query = self.construct_query(query, size, **kwargs)
            query_created = True
        elif custom_query:
            search_query = custom_query
        else:
            search_query = self.construct_query(None, size, blank=True, **kwargs)
        if tags:
            # Get a blank search query
            if not query_created:
                search_query = self.construct_query(None, size, blank=True, **kwargs)
                query_created = True
            for item in tags:
                for tagname, tagvalue in item.items():
                    add_bool.append({tagname: tagvalue})

        bypass_check = kwargs.get("bypass_check", False)
        if not bypass_check:
            valid = self.check_valid_query(query_params, custom_query, **kwargs)
            if isinstance(valid, dict):
                return valid
        return search_query

    def check_valid_query(self, query_params, custom_query, **kwargs):
        required_any = ["query", "tags"]
        if not any(field in query_params for field in required_any):
            if not custom_query:
                message = "Empty query!"
                message_class = "warning"
                return {"message": message, "class": message_class}

    @abstractmethod
    def run_query(self, **kwargs):
        pass

    def filter_blacklisted(self, user, response):
        """
        Low-level filter to take the raw search response and remove
        objects from it we want to keep secret.
        Does not return anything; the response object is mutated in place.
        """
        response["redacted"] = 0
        response["exemption"] = None
        if user.is_superuser:
            response["exemption"] = True
        # is_anonymous = isinstance(user, AnonymousUser)
        # For every hit from ES
        for index, item in enumerate(list(response["hits"]["hits"])):
            # For every blacklisted type
            for blacklisted_type in settings.ELASTICSEARCH_BLACKLISTED.keys():
                # Check the field we are matching exists
                if "_source" in item.keys():
                    data_index = "_source"
                elif "fields" in item.keys():
                    data_index = "fields"
                else:
                    return False
                if blacklisted_type in item[data_index].keys():
                    content = item[data_index][blacklisted_type]
                    # For every item in the blacklisted array for the type
                    for blacklisted_item in settings.ELASTICSEARCH_BLACKLISTED[
                        blacklisted_type
                    ]:
                        if blacklisted_item == str(content):
                            # Remove the item
                            if item in response["hits"]["hits"]:
                                # Let the UI know something was redacted
                                if (
                                    "exemption"
                                    not in response["hits"]["hits"][index][data_index]
                                ):
                                    response["redacted"] += 1
                                # Anonymous users never see redacted items
                                if user.is_anonymous:
                                    # Set it to None so the index is not off
                                    response["hits"]["hits"][index] = None
                                elif not user.has_perm("core.bypass_blacklist"):
                                    response["hits"]["hits"][index] = None
                                else:
                                    response["hits"]["hits"][index][data_index][
                                        "exemption"
                                    ] = True
        # Actually get rid of all the things we set to None
        response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]

    def add_bool(self, search_query, add_bool):
        """
        Add the specified boolean matches to the search query.
        """
        if not add_bool:
            return
        for item in add_bool:
            search_query["query"]["bool"]["must"].append({"match_phrase": item})

    def add_top(self, search_query, add_top, negative=False):
        """
        Merge the add_top clauses into the top level of the search query.
        """
        if not add_top:
            return
        if negative:
            for item in add_top:
                if "must_not" in search_query["query"]["bool"]:
                    search_query["query"]["bool"]["must_not"].append(item)
                else:
                    search_query["query"]["bool"]["must_not"] = [item]
        else:
            for item in add_top:
                if "query" not in search_query:
                    search_query["query"] = {"bool": {"must": []}}
                search_query["query"]["bool"]["must"].append(item)

    def schedule_check_aggregations(self, rule_object, result_map):
        """
        Check the results of a scheduled query against the aggregation
        thresholds of the rule.
        """
        if rule_object.aggs is None:
            return result_map
        for index, (meta, result) in result_map.items():
            # Default to True: if no aggs are required, we still want to match
            match = True
            for agg_name, (operator, number) in rule_object.aggs.items():
                if agg_name in meta["aggs"]:
                    agg_value = meta["aggs"][agg_name]["value"]
                    if operator == ">":
                        match = agg_value > number
                    elif operator == "<":
                        match = agg_value < number
                    elif operator == "=":
                        match = agg_value == number
                    else:
                        # Unknown operator
                        match = False
                else:
                    # No aggregation found, but it is required
                    match = False
                # setdefault guards the case where the agg is missing from
                # the result but still needs its match status recorded
                result_map[index][0]["aggs"].setdefault(agg_name, {})["match"] = match
        return result_map
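
    # rule_object.aggs is assumed to map an aggregation name to an
    # (operator, threshold) pair, e.g. {"avg_sentiment": (">", 0.5)}
    # (illustrative; the format follows the tuple unpacking above).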

    def query(self, user, search_query, **kwargs):
        # For time tracking
        start = time.process_time()
        if settings.CACHE:
            # Sort the keys so the hash is the same for identical queries
            query_normalised = orjson.dumps(search_query, option=orjson.OPT_SORT_KEYS)
            query_hash = siphash(self.hash_key, query_normalised)
            cache_hit = r.get(f"query_cache.{user.id}.{query_hash}")
            if cache_hit:
                response = orjson.loads(cache_hit)
                time_took = (time.process_time() - start) * 1000
                # Round to 3 significant figures
                time_took_rounded = round(
                    time_took, 3 - int(floor(log10(abs(time_took)))) - 1
                )
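                # The expression above keeps three significant figures:
                # round() to (3 - 1 - floor(log10(|x|))) decimal places,
                # e.g. 1234.5 -> 1230.0 and 0.012345 -> 0.0123. Note that
                # log10 would raise on exactly 0, though a process_time
                # delta is realistically always positive.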
                return {
                    "object_list": response,
                    "took": time_took_rounded,
                    "cache": True,
                }

        self.log.debug(f"Running query: {search_query}")
        response = self.run_query(user, search_query, **kwargs)
        # For Elasticsearch
        if isinstance(response, Exception):
            message = f"Error: {response.info['error']['root_cause'][0]['type']}"
            message_class = "danger"
            return {"message": message, "class": message_class}
        if "took" in response:
            if response["took"] is None:
                return None
            if "error" in response:
                message = f"Error: {response['error']}"
                message_class = "danger"
                time_took = (time.process_time() - start) * 1000
                # Round to 3 significant figures
                time_took_rounded = round(
                    time_took, 3 - int(floor(log10(abs(time_took)))) - 1
                )
                return {
                    "message": message,
                    "class": message_class,
                    "took": time_took_rounded,
                }
            elif len(response["hits"]["hits"]) == 0:
                message = "No results."
                message_class = "danger"
                time_took = (time.process_time() - start) * 1000
                # Round to 3 significant figures
                time_took_rounded = round(
                    time_took, 3 - int(floor(log10(abs(time_took)))) - 1
                )
                return {
                    "message": message,
                    "class": message_class,
                    "took": time_took_rounded,
                }
        # For Druid
        elif "error" in response:
            if "errorMessage" in response:
                context = {
                    "message": response["errorMessage"],
                    "class": "danger",
                }
                return context
            else:
                return response

        # Removed for now, no point given we have restricted indexes
        # self.filter_blacklisted(user, response)

        # Parse the response
        response_parsed = self.parse(response)

        # Write cache
        if settings.CACHE:
            to_write_cache = orjson.dumps(response_parsed)
            r.set(f"query_cache.{user.id}.{query_hash}", to_write_cache)
            r.expire(f"query_cache.{user.id}.{query_hash}", settings.CACHE_TIMEOUT)

        time_took = (time.process_time() - start) * 1000
        # Round to 3 significant figures
        time_took_rounded = round(time_took, 3 - int(floor(log10(abs(time_took)))) - 1)
        return {"object_list": response_parsed, "took": time_took_rounded}

    def construct_context_query(
        self, index, net, channel, src, num, size, type=None, nicks=None
    ):
        # Get the initial query
        query = self.construct_query(None, size, blank=True)
        extra_must = []
        extra_should = []
        extra_should2 = []
        if num:
            extra_must.append({"match_phrase": {"num": num}})
        if net:
            extra_must.append({"match_phrase": {"net": net}})
        if channel:
            extra_must.append({"match": {"channel": channel}})
        if nicks:
            for nick in nicks:
                extra_should2.append({"match": {"nick": nick}})

        types = ["msg", "notice", "action", "kick", "topic", "mode"]
        fields = [
            "nick",
            "ident",
            "host",
            "channel",
            "ts",
            "msg",
            "type",
            "net",
            "src",
            "tokens",
        ]
        query["fields"] = fields

        if index == "internal":
            fields.append("mtype")
            if channel == "*status" or type == "znc":
                if {"match": {"channel": channel}} in extra_must:
                    extra_must.remove({"match": {"channel": channel}})
                extra_should2 = []
                # Type is one of msg or notice
                # extra_should.append({"match": {"mtype": "msg"}})
                # extra_should.append({"match": {"mtype": "notice"}})
                extra_should.append({"match": {"type": "znc"}})
                extra_should.append({"match": {"type": "self"}})
                extra_should2.append({"match": {"type": "znc"}})
                extra_should2.append({"match": {"nick": channel}})
            elif type == "auth":
                if {"match": {"channel": channel}} in extra_must:
                    extra_must.remove({"match": {"channel": channel}})
                extra_should2 = []
                extra_should2.append({"match": {"nick": channel}})
                # extra_should2.append({"match": {"mtype": "msg"}})
                # extra_should2.append({"match": {"mtype": "notice"}})
                extra_should.append({"match": {"type": "query"}})
                extra_should2.append({"match": {"type": "self"}})
                extra_should.append({"match": {"nick": channel}})
            else:
                for ctype in types:
                    extra_should.append({"match": {"mtype": ctype}})
        else:
            for ctype in types:
                extra_should.append({"match": {"type": ctype}})
        # query = {
        #     "index": index,
        #     "limit": size,
        #     "query": {
        #         "bool": {
        #             "must": [
        #                 # {"equals": {"src": src}},
        #                 # {
        #                 #     "bool": {
        #                 #         "should": [*extra_should],
        #                 #     }
        #                 # },
        #                 # {
        #                 #     "bool": {
        #                 #         "should": [*extra_should2],
        #                 #     }
        #                 # },
        #                 *extra_must,
        #             ]
        #         }
        #     },
        #     "fields": fields,
        #     # "_source": False,
        # }
        for x in extra_must:
            query["query"]["bool"]["must"].append(x)
        if extra_should:
            query["query"]["bool"]["must"].append(
                {"bool": {"should": [*extra_should]}}
            )
        if extra_should2:
            query["query"]["bool"]["must"].append(
                {"bool": {"should": [*extra_should2]}}
            )
        return query
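
    # The resulting query (illustrative shape, for net="libera" and
    # channel="#foo") requires every extra_must clause plus at least one
    # match from each should group:
    #   {"bool": {"must": [{"match_phrase": {"net": "libera"}},
    #                      {"match": {"channel": "#foo"}},
    #                      {"bool": {"should": [...type matches...]}}]}}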

    @abstractmethod
    def query_results(self, **kwargs):
        pass

    def process_results(self, response, **kwargs):
        if kwargs.get("annotate"):
            annotate_results(response)
        if kwargs.get("reverse"):
            response.reverse()
        if kwargs.get("dedup"):
            dedup_fields = kwargs.get("dedup_fields")
            if not dedup_fields:
                dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
            response = dedup_list(response, dedup_fields)
        return response
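
    # Example (illustrative): process_results(hits, reverse=True, dedup=True)
    # flips the result order, then collapses duplicates on the default
    # field set above into {"type": "control", "hidden": N} entries.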

    @abstractmethod
    def parse(self, response):
        pass