2022-09-27 14:15:08 +00:00
|
|
|
import random
|
|
|
|
import string
|
|
|
|
import time
|
2022-11-21 19:43:23 +00:00
|
|
|
from abc import ABC, abstractmethod
|
2022-09-27 14:15:08 +00:00
|
|
|
from math import floor, log10
|
|
|
|
|
|
|
|
import orjson
|
|
|
|
from django.conf import settings
|
|
|
|
from siphashc import siphash
|
|
|
|
|
|
|
|
from core import r
|
|
|
|
from core.db.processing import annotate_results
|
|
|
|
from core.util import logs
|
2023-01-12 07:20:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
def remove_defaults(query_params):
    """
    Strip entries from query_params that merely restate a configured default.

    A key is dropped when it is present in settings.DRILLDOWN_DEFAULT_PARAMS
    and its value equals the default stored there. The mapping is mutated in
    place; nothing is returned.
    """
    # Collect first, delete afterwards, so the mapping is never resized
    # while it is being iterated.
    redundant = [
        name
        for name, current in query_params.items()
        if name in settings.DRILLDOWN_DEFAULT_PARAMS
        and current == settings.DRILLDOWN_DEFAULT_PARAMS[name]
    ]
    for name in redundant:
        del query_params[name]
|
|
|
|
|
|
|
|
|
|
|
|
def add_defaults(query_params):
    """
    Fill in any default parameter that query_params does not already carry.

    Every key of settings.DRILLDOWN_DEFAULT_PARAMS that is absent from
    query_params is added with its default value. Existing keys are left
    untouched. The mapping is mutated in place; nothing is returned.
    """
    for name, default in settings.DRILLDOWN_DEFAULT_PARAMS.items():
        if name in query_params:
            # Caller supplied an explicit value; keep it.
            continue
        query_params[name] = default
|
|
|
|
|
|
|
|
|
|
|
|
def dedup_list(data, check_keys):
    """
    Remove duplicate dictionaries from list.

    Two items are duplicates when they agree on every field named in
    check_keys (a field missing from both counts as agreeing). The first
    occurrence is kept; later ones are dropped and replaced by a single
    marker of the form {"type": "control", "hidden": <count>} inserted where
    the run of dropped items ended.

    :param data: iterable of dictionaries, in display order
    :param check_keys: field names that make up an item's identity
    :return: new list with duplicates collapsed into control markers
    """
    # Placeholder for an absent field. Keeping one slot per check key stops
    # items with different field sets from colliding, e.g. {"msg": "a"} and
    # {"nick": "a"} must not produce the same key.
    absent = object()

    seen = set()
    out = []
    hidden = 0
    for item in data:
        dedup_key = tuple(item.get(key, absent) for key in check_keys)
        if dedup_key in seen:
            hidden += 1
            continue
        if hidden:
            # Flush the marker for the run of duplicates that just ended.
            out.append({"type": "control", "hidden": hidden})
            hidden = 0
        out.append(item)
        seen.add(dedup_key)
    if hidden:
        # The input ended on a run of duplicates.
        out.append({"type": "control", "hidden": hidden})
    return out
|
|
|
|
|
|
|
|
|
2022-11-21 19:43:23 +00:00
|
|
|
class StorageBackend(ABC):
    """
    Abstract base class for search storage backends.

    Provides the backend-independent plumbing: building a search query from
    request parameters, validating it, running it with an optional result
    cache, and post-processing the results. Concrete backends implement
    initialise, construct_query, run_query, query_results and parse.
    """

    def __init__(self, name):
        """
        Create the backend logger and load the cache hash key.

        :param name: name passed to the logger factory and shown in the
            startup log line
        """
        self.log = logs.get_logger(name)
        self.log.info(f"Initialising storage backend {name}")

        self.initialise_caching()
        # NOTE(review): initialise() is not invoked from here (the call below
        # is commented out); presumably it is called elsewhere - confirm.
        # self.initialise()

    @abstractmethod
    def initialise(self, **kwargs):
        """
        Backend-specific setup hook; must be implemented by subclasses.
        """

    def initialise_caching(self):
        """
        Load the key used to hash cached queries, creating it if missing.

        The key is read from the "cache_hash_key" entry of the shared store
        `r`. When no key is stored, 16 random lowercase letters are generated
        and written back. The result is kept on self.hash_key.
        """
        hash_key = r.get("cache_hash_key")
        if not hash_key:
            letters = string.ascii_lowercase
            hash_key = "".join(random.choice(letters) for _ in range(16))
            self.log.debug(f"Created new hash key: {hash_key}")
            r.set("cache_hash_key", hash_key)
        else:
            # Only decode when the store handed back raw bytes; a client
            # configured to decode responses already returns str.
            if isinstance(hash_key, bytes):
                hash_key = hash_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {hash_key}")
        self.hash_key = hash_key

    @abstractmethod
    def construct_query(self, **kwargs):
        """
        Build a backend-specific search query; must be implemented by
        subclasses.

        parse_query calls this as construct_query(query, size, **kwargs),
        passing blank=True when no query text is available.
        """

    def parse_query(self, query_params, tags, size, custom_query, add_bool, **kwargs):
        """
        Turn request parameters into a search query for this backend.

        :param query_params: mapping of request parameters; the "query" key,
            when present, holds the query text
        :param tags: iterable of {name: value} mappings; every pair is
            appended to add_bool
        :param size: size argument forwarded to construct_query
        :param custom_query: ready-made query used when query_params has no
            "query" entry
        :param add_bool: list that is extended in place with one
            {tagname: tagvalue} entry per tag
        :param kwargs: forwarded to construct_query and check_valid_query;
            bypass_check=True skips validation
        :return: the search query, or a {"message", "class"} dict when
            validation fails
        """
        query_created = False
        if "query" in query_params:
            query = query_params["query"]
            search_query = self.construct_query(query, size, **kwargs)
            query_created = True
        else:
            if custom_query:
                search_query = custom_query
            else:
                search_query = self.construct_query(None, size, blank=True, **kwargs)

        if tags:
            # Get a blank search query
            # NOTE(review): this also replaces a caller-supplied custom_query
            # with a blank query whenever tags are given - confirm intended.
            if not query_created:
                search_query = self.construct_query(None, size, blank=True, **kwargs)
                query_created = True
            for item in tags:
                for tagname, tagvalue in item.items():
                    add_bool.append({tagname: tagvalue})

        bypass_check = kwargs.get("bypass_check", False)
        if not bypass_check:
            valid = self.check_valid_query(query_params, custom_query, **kwargs)
            if isinstance(valid, dict):
                # Validation produced an error message for the caller.
                return valid

        return search_query

    def check_valid_query(self, query_params, custom_query, **kwargs):
        """
        Check that the request carries something to search for.

        :param query_params: mapping of request parameters
        :param custom_query: ready-made query, if any
        :param kwargs: accepted and ignored so that parse_query can forward
            its own keyword arguments without raising TypeError
        :return: a {"message", "class"} dict when neither "query" nor "tags"
            is in query_params and no custom_query was given, otherwise None
        """
        required_any = ["query", "tags"]
        if not any(field in query_params for field in required_any):
            if not custom_query:
                message = "Empty query!"
                message_class = "warning"
                return {"message": message, "class": message_class}
        return None

    @abstractmethod
    def run_query(self, **kwargs):
        """
        Execute a search against the backend; must be implemented by
        subclasses.

        query() calls this as run_query(user, search_query, **kwargs).
        """

    def filter_blacklisted(self, user, response):
        """
        Low level filter to take the raw search response and remove
        objects from it we want to keep secret.

        The response is mutated in place: response["redacted"] counts the
        blacklisted hits, response["exemption"] is True for superusers and
        None otherwise, and hits the user may not see are removed from
        response["hits"]["hits"]. Users holding the "core.bypass_blacklist"
        permission keep the hit, flagged with "exemption": True.

        :return: False if a hit has neither "_source" nor "fields" (filtering
            stops at that hit and entries already blanked to None are left in
            the list); otherwise None
        """
        response["redacted"] = 0
        response["exemption"] = None
        if user.is_superuser:
            response["exemption"] = True
        # is_anonymous = isinstance(user, AnonymousUser)

        hits = response["hits"]["hits"]
        # For every hit from ES. Iterate over a copy because entries of the
        # live list are overwritten with None as we go.
        for index, item in enumerate(list(hits)):
            # For every blacklisted type
            # NOTE(review): the types come from ELASTICSEARCH_BLACKLISTED but
            # the values below are read from BLACKLISTED - confirm both
            # settings exist and are meant to differ.
            for blacklisted_type in settings.ELASTICSEARCH_BLACKLISTED.keys():
                # Check this field we are matching exists
                if "_source" in item:
                    data_index = "_source"
                elif "fields" in item:
                    data_index = "fields"
                else:
                    return False
                if blacklisted_type not in item[data_index]:
                    continue
                content = item[data_index][blacklisted_type]
                # For every item in the blacklisted array for the type
                for blacklisted_item in settings.BLACKLISTED[blacklisted_type]:
                    if blacklisted_item != str(content):
                        continue
                    # Remove the item
                    if item not in hits:
                        continue
                    # Let the UI know something was redacted
                    if "exemption" not in hits[index][data_index]:
                        response["redacted"] += 1
                    if user.is_anonymous or not user.has_perm(
                        "core.bypass_blacklist"
                    ):
                        # Just set it to none so the index is not off
                        hits[index] = None
                    else:
                        hits[index][data_index]["exemption"] = True

        # Actually get rid of all the things we set to None
        response["hits"]["hits"] = [hit for hit in hits if hit]

    @staticmethod
    def _round_sig_figs(value, figures=3):
        """
        Round value to the given number of significant figures.

        Zero is returned unchanged, because log10(0) is undefined and would
        otherwise raise ValueError.
        """
        if value == 0:
            return value
        return round(value, figures - int(floor(log10(abs(value)))) - 1)

    def query(self, user, search_query, **kwargs):
        """
        Run a search query, using the result cache when it is enabled.

        :param user: requesting user; user.id is part of the cache key
        :param search_query: query as produced by parse_query
        :param kwargs: forwarded to run_query
        :return: one of
            - {"object_list", "took"} (plus "cache": True on a cache hit),
            - {"message", "class"[, "took"]} describing an error or an empty
              result,
            - the raw response when it has an "error" key but no
              "errorMessage",
            - None when the response reports "took" as None.
            "took" is CPU time in milliseconds, rounded to 3 significant
            figures.
        """
        # For time tracking
        start = time.process_time()
        if settings.CACHE:
            # Sort the keys so the hash is the same
            query_normalised = orjson.dumps(search_query, option=orjson.OPT_SORT_KEYS)
            digest = siphash(self.hash_key, query_normalised)
            cache_key = f"query_cache.{user.id}.{digest}"
            cache_hit = r.get(cache_key)
            if cache_hit:
                response = orjson.loads(cache_hit)
                time_took = (time.process_time() - start) * 1000
                return {
                    "object_list": response,
                    "took": self._round_sig_figs(time_took),
                    "cache": True,
                }
        response = self.run_query(user, search_query, **kwargs)

        # For Elasticsearch
        if isinstance(response, Exception):
            message = f"Error: {response.info['error']['root_cause'][0]['type']}"
            message_class = "danger"
            return {"message": message, "class": message_class}
        if "took" in response:
            if response["took"] is None:
                return None
        # NOTE(review): "hits" is read before the Druid error check below, so
        # an error response without a "hits" key raises KeyError here -
        # confirm whether that can happen.
        if len(response["hits"]["hits"]) == 0:
            message = "No results."
            message_class = "danger"
            time_took = (time.process_time() - start) * 1000
            return {
                "message": message,
                "class": message_class,
                "took": self._round_sig_figs(time_took),
            }

        # For Druid
        if "error" in response:
            if "errorMessage" in response:
                context = {
                    "message": response["errorMessage"],
                    "class": "danger",
                }
                return context
            else:
                return response

        # Removed for now, no point given we have restricted indexes
        # self.filter_blacklisted(user, response)

        # Parse the response
        response_parsed = self.parse(response)

        # Write cache
        if settings.CACHE:
            to_write_cache = orjson.dumps(response_parsed)
            r.set(cache_key, to_write_cache)
            r.expire(cache_key, settings.CACHE_TIMEOUT)

        time_took = (time.process_time() - start) * 1000
        return {
            "object_list": response_parsed,
            "took": self._round_sig_figs(time_took),
        }

    @abstractmethod
    def query_results(self, **kwargs):
        """
        High-level search entry point; must be implemented by subclasses.
        """

    def process_results(self, response, **kwargs):
        """
        Apply optional post-processing steps to a list of results.

        :param response: list of result dictionaries
        :param kwargs: recognised flags, applied in this order:
            annotate - call annotate_results(response); its return value is
                ignored, so it presumably works in place (confirm),
            reverse - reverse the list in place,
            dedup - collapse duplicates with dedup_list, keyed on
                dedup_fields (defaults to msg, nick, ident, host, net and
                channel when empty or not given)
        :return: the processed list
        """
        if kwargs.get("annotate"):
            annotate_results(response)
        if kwargs.get("reverse"):
            response.reverse()
        if kwargs.get("dedup"):
            dedup_fields = kwargs.get("dedup_fields")
            if not dedup_fields:
                dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
            response = dedup_list(response, dedup_fields)
        return response

    @abstractmethod
    def parse(self, response):
        """
        Convert a raw backend response into the result list returned by
        query(); must be implemented by subclasses.
        """
|