neptune/core/db/__init__.py

import random
import string
import time
from abc import ABC, abstractmethod
from math import floor, log10

import orjson
from django.conf import settings
from siphashc import siphash

from core import r
from core.db.processing import annotate_results
from core.util import logs


def remove_defaults(query_params):
    """
    Drop every entry of ``query_params`` whose value equals the default
    configured for that field in ``settings.DRILLDOWN_DEFAULT_PARAMS``.

    The mapping is mutated in place; nothing is returned.
    """
    # Collect the keys first so the mapping is never resized mid-iteration.
    redundant = [
        field
        for field, value in query_params.items()
        if field in settings.DRILLDOWN_DEFAULT_PARAMS
        and value == settings.DRILLDOWN_DEFAULT_PARAMS[field]
    ]
    for field in redundant:
        del query_params[field]


def add_defaults(query_params):
    """
    Fill ``query_params`` with every field of
    ``settings.DRILLDOWN_DEFAULT_PARAMS`` that it does not already contain.

    Existing entries are left alone. The mapping is mutated in place;
    nothing is returned.
    """
    for field, default in settings.DRILLDOWN_DEFAULT_PARAMS.items():
        query_params.setdefault(field, default)


def dedup_list(data, check_keys):
    """
    Remove duplicate dictionaries from list.

    Two entries are duplicates when they agree on every key listed in
    ``check_keys``. A key that is absent from an entry only matches the same
    key being absent from another entry, so entries carrying different
    subsets of ``check_keys`` are never confused with each other.

    The first occurrence of each entry is kept. Every run of dropped entries
    is summarised by one ``{"type": "control", "hidden": <count>}`` marker,
    emitted just before the next kept entry (or at the end of the list).

    :param data: iterable of dictionaries, in display order
    :param check_keys: keys whose values decide whether entries are equal
    :return: a new list; ``data`` itself is not modified
    """
    # Placeholder for "key not present", so every value keeps its position in
    # the comparison tuple instead of shifting left when a key is missing.
    missing = object()

    seen = set()
    out = []
    hidden = 0

    for entry in data:
        dedup_key = tuple(
            entry[key] if key in entry else missing for key in check_keys
        )
        if dedup_key in seen:
            hidden += 1
            continue
        if hidden:
            out.append({"type": "control", "hidden": hidden})
            hidden = 0
        seen.add(dedup_key)
        out.append(entry)

    # Duplicates at the very end of the input still need their marker.
    if hidden:
        out.append({"type": "control", "hidden": hidden})
    return out


class StorageBackend(ABC):
    def __init__(self, name):
        """
        Create the backend's logger under ``name`` and load the cache hash key.
        """
        logger = logs.get_logger(name)
        self.log = logger
        logger.info(f"Initialising storage backend {name}")

        # NOTE(review): initialise() is not invoked from the constructor (the
        # call is disabled) -- presumably subclasses run it on first use;
        # confirm against the concrete backends.
        self.initialise_caching()

    @abstractmethod
    def initialise(self, **kwargs):
        pass

    def initialise_caching(self):
        """
        Load the cache hash key from the shared store ``r`` into
        ``self.hash_key``, generating and storing a new one when none exists.

        A new key is 16 random lowercase ASCII letters. A stored key is read
        back as bytes and decoded as ASCII.
        """
        stored = r.get("cache_hash_key")
        if stored:
            hash_key = stored.decode("ascii")
            self.log.debug(f"Decoded hash key: {hash_key}")
        else:
            alphabet = string.ascii_lowercase
            hash_key = "".join(random.choice(alphabet) for _ in range(16))
            self.log.debug(f"Created new hash key: {hash_key}")
            r.set("cache_hash_key", hash_key)
        self.hash_key = hash_key

    @abstractmethod
    def construct_query(self, **kwargs):
        pass

    def parse_query(self, query_params, tags, size, custom_query, add_bool, **kwargs):
        query_created = False
        if "query" in query_params:
            query = query_params["query"]
            search_query = self.construct_query(query, size, **kwargs)
            query_created = True
        else:
            if custom_query:
                search_query = custom_query
            else:
                search_query = self.construct_query(None, size, blank=True, **kwargs)

        if tags:
            # Get a blank search query
            if not query_created:
                search_query = self.construct_query(None, size, blank=True, **kwargs)
                query_created = True
            for item in tags:
                for tagname, tagvalue in item.items():
                    add_bool.append({tagname: tagvalue})

        bypass_check = kwargs.get("bypass_check", False)
        if not bypass_check:
            valid = self.check_valid_query(query_params, custom_query, **kwargs)
            if isinstance(valid, dict):
                return valid

        return search_query

    def check_valid_query(self, query_params, custom_query):
        required_any = ["query", "tags"]
        if not any([field in query_params.keys() for field in required_any]):
            if not custom_query:
                message = "Empty query!"
                message_class = "warning"
                return {"message": message, "class": message_class}

    @abstractmethod
    def run_query(self, **kwargs):
        pass

    def filter_blacklisted(self, user, response):
        """
        Low level filter to take the raw search response and remove
        objects from it we want to keep secret.
        Does not return, the object is mutated in place.

        Sets ``response["redacted"]`` to the number of blacklisted hits found
        and ``response["exemption"]`` to ``True`` for superusers (``None``
        otherwise). A blacklisted hit is removed for anonymous users and for
        users without the ``core.bypass_blacklist`` permission; for users with
        that permission it is kept and its data gets ``"exemption": True``.

        :param user: object exposing ``is_superuser``, ``is_anonymous`` and
            ``has_perm``
        :param response: raw response with the hits under
            ``response["hits"]["hits"]``, each carrying its data under
            ``"_source"`` or ``"fields"``
        """
        response["redacted"] = 0
        response["exemption"] = None
        if user.is_superuser:
            response["exemption"] = True

        hits = response["hits"]["hits"]
        for index, item in enumerate(hits):
            if "_source" in item:
                data_index = "_source"
            elif "fields" in item:
                data_index = "fields"
            else:
                # Nothing to inspect on this hit. Skip only this one: bailing
                # out of the method here would leave later hits unfiltered.
                continue
            data = item[data_index]

            # NOTE(review): the field names come from
            # settings.ELASTICSEARCH_BLACKLISTED but the values from
            # settings.BLACKLISTED -- confirm both settings exist and share
            # the same keys.
            is_blacklisted = any(
                blacklisted_item == str(data[blacklisted_type])
                for blacklisted_type in settings.ELASTICSEARCH_BLACKLISTED
                if blacklisted_type in data
                for blacklisted_item in settings.BLACKLISTED[blacklisted_type]
            )
            if not is_blacklisted:
                continue

            # Let the UI know something was redacted
            if "exemption" not in data:
                response["redacted"] += 1

            if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
                # Just set it to None so the indexes stay aligned while looping
                hits[index] = None
            else:
                data["exemption"] = True

        # Actually get rid of all the things we set to None
        response["hits"]["hits"] = [hit for hit in hits if hit]

    def query(self, user, search_query, **kwargs):
        """
        Run ``search_query`` for ``user``, going through the per-user cache
        when ``settings.CACHE`` is enabled.

        :param user: requesting user; ``user.id`` is part of the cache key
        :param search_query: query passed to ``run_query``; must be
            orjson-serialisable when caching is enabled
        :param kwargs: forwarded to ``run_query``
        :return: one of

            * ``{"object_list": ..., "took": ...}`` on success, plus
              ``"cache": True`` when served from the cache
            * ``{"message": ..., "class": ...}`` for a backend error or an
              empty result (the latter also carries ``"took"``)
            * the raw response when it has an ``"error"`` key but no
              ``"errorMessage"``
            * ``None`` when the response reports ``"took"`` as ``None``
        """
        # For time tracking.
        # NOTE(review): process_time() counts CPU time only, so time spent
        # waiting on the backend is presumably not reflected in "took" --
        # confirm whether wall-clock time was intended.
        start = time.process_time()

        def elapsed_ms():
            """Milliseconds since ``start``, rounded to 3 significant figures."""
            took = (time.process_time() - start) * 1000
            if took <= 0:
                # log10() is undefined for 0, which is a real possibility
                # given the clock resolution on a fast cache hit.
                return 0.0
            return round(took, 3 - int(floor(log10(took))) - 1)

        cache_key = None
        if settings.CACHE:
            # Sort the keys so the hash is the same
            query_normalised = orjson.dumps(search_query, option=orjson.OPT_SORT_KEYS)
            query_hash = siphash(self.hash_key, query_normalised)
            cache_key = f"query_cache.{user.id}.{query_hash}"
            cache_hit = r.get(cache_key)
            if cache_hit:
                return {
                    "object_list": orjson.loads(cache_hit),
                    "took": elapsed_ms(),
                    "cache": True,
                }

        response = self.run_query(user, search_query, **kwargs)

        # For Elasticsearch
        if isinstance(response, Exception):
            try:
                error_type = response.info["error"]["root_cause"][0]["type"]
            except (AttributeError, KeyError, IndexError, TypeError):
                # Not every exception carries an Elasticsearch-style ``info``
                # payload; report the exception class instead of crashing.
                error_type = type(response).__name__
            return {"message": f"Error: {error_type}", "class": "danger"}

        if "took" in response and response["took"] is None:
            return None

        # For Druid. Checked before touching the hits because an error payload
        # is not expected to contain any.
        if "error" in response:
            if "errorMessage" in response:
                return {
                    "message": response["errorMessage"],
                    "class": "danger",
                }
            return response

        if not response["hits"]["hits"]:
            return {
                "message": "No results.",
                "class": "danger",
                "took": elapsed_ms(),
            }

        # Removed for now, no point given we have restricted indexes
        # self.filter_blacklisted(user, response)

        # Parse the response
        response_parsed = self.parse(response)

        # Write cache
        if cache_key is not None:
            r.set(cache_key, orjson.dumps(response_parsed))
            r.expire(cache_key, settings.CACHE_TIMEOUT)

        return {"object_list": response_parsed, "took": elapsed_ms()}

    @abstractmethod
    def query_results(self, **kwargs):
        pass

    def process_results(self, response, **kwargs):
        if kwargs.get("annotate"):
            annotate_results(response)
        if kwargs.get("reverse"):
            response.reverse()
        if kwargs.get("dedup"):
            dedup_fields = kwargs.get("dedup_fields")
            if not dedup_fields:
                dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
            response = dedup_list(response, dedup_fields)
        return response

    @abstractmethod
    def parse(self, response):
        pass
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`import random`
			`import string`
			`import time`
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`from abc import ABC, abstractmethod`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`from math import floor, log10`

			`import orjson`
			`from django.conf import settings`
			`from siphashc import siphash`

			`from core import r`
			`from core.db.processing import annotate_results`
			`from core.util import logs`
Implement notification rules and settings 2023-01-12 07:20:43 +00:00

			`def remove_defaults(query_params):`
			`for field, value in list(query_params.items()):`
			`if field in settings.DRILLDOWN_DEFAULT_PARAMS:`
			`if value == settings.DRILLDOWN_DEFAULT_PARAMS[field]:`
			`del query_params[field]`


			`def add_defaults(query_params):`
			`for field, value in settings.DRILLDOWN_DEFAULT_PARAMS.items():`
			`if field not in query_params:`
			`query_params[field] = value`


			`def dedup_list(data, check_keys):`
			`"""`
			`Remove duplicate dictionaries from list.`
			`"""`
			`seen = set()`
			`out = []`

			`dup_count = 0`
			`for x in data:`
			`dedupeKey = tuple(x[k] for k in check_keys if k in x)`
			`if dedupeKey in seen:`
			`dup_count += 1`
			`continue`
			`if dup_count > 0:`
			`out.append({"type": "control", "hidden": dup_count})`
			`dup_count = 0`
			`out.append(x)`
			`seen.add(dedupeKey)`
			`if dup_count > 0:`
			`out.append({"type": "control", "hidden": dup_count})`
			`return out`


Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`class StorageBackend(ABC):`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`def __init__(self, name):`
			`self.log = logs.get_logger(name)`
			`self.log.info(f"Initialising storage backend {name}")`

			`self.initialise_caching()`
Initialise ES client only on first search 2023-01-16 07:20:37 +00:00			`# self.initialise()`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`@abstractmethod`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`def initialise(self, **kwargs):`
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`pass`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
			`def initialise_caching(self):`
			`hash_key = r.get("cache_hash_key")`
			`if not hash_key:`
			`letters = string.ascii_lowercase`
			`hash_key = "".join(random.choice(letters) for i in range(16))`
			`self.log.debug(f"Created new hash key: {hash_key}")`
			`r.set("cache_hash_key", hash_key)`
			`else:`
			`hash_key = hash_key.decode("ascii")`
			`self.log.debug(f"Decoded hash key: {hash_key}")`
			`self.hash_key = hash_key`

Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`@abstractmethod`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`def construct_query(self, **kwargs):`
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`pass`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`def parse_query(self, query_params, tags, size, custom_query, add_bool, **kwargs):`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`query_created = False`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`if "query" in query_params:`
			`query = query_params["query"]`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`search_query = self.construct_query(query, size, **kwargs)`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`query_created = True`
			`else:`
			`if custom_query:`
			`search_query = custom_query`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`else:`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`search_query = self.construct_query(None, size, blank=True, **kwargs)`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
			`if tags:`
			`# Get a blank search query`
			`if not query_created:`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`search_query = self.construct_query(None, size, blank=True, **kwargs)`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`query_created = True`
Allow duplicate tag keys 2022-09-30 06:22:22 +00:00			`for item in tags:`
			`for tagname, tagvalue in item.items():`
			`add_bool.append({tagname: tagvalue})`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
Finish implementing webhook delivery 2023-01-15 23:02:13 +00:00			`bypass_check = kwargs.get("bypass_check", False)`
			`if not bypass_check:`
			`valid = self.check_valid_query(query_params, custom_query, **kwargs)`
			`if isinstance(valid, dict):`
			`return valid`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00
			`return search_query`

			`def check_valid_query(self, query_params, custom_query):`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`required_any = ["query", "tags"]`
			`if not any([field in query_params.keys() for field in required_any]):`
			`if not custom_query:`
			`message = "Empty query!"`
			`message_class = "warning"`
			`return {"message": message, "class": message_class}`

Implement running scheduled rules and check aggregations 2023-01-15 17:59:12 +00:00			`@abstractmethod`
			`def run_query(self, **kwargs):`
			`pass`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`def filter_blacklisted(self, user, response):`
			`"""`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`Low level filter to take the raw search response and remove`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`objects from it we want to keep secret.`
			`Does not return, the object is mutated in place.`
			`"""`
			`response["redacted"] = 0`
			`response["exemption"] = None`
			`if user.is_superuser:`
			`response["exemption"] = True`
			`# is_anonymous = isinstance(user, AnonymousUser)`
			`# For every hit from ES`
			`for index, item in enumerate(list(response["hits"]["hits"])):`
			`# For every blacklisted type`
Replace OpenSearch with Elasticsearch 2022-11-21 07:20:29 +00:00			`for blacklisted_type in settings.ELASTICSEARCH_BLACKLISTED.keys():`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`# Check this field we are matching exists`
			`if "_source" in item.keys():`
			`data_index = "_source"`
			`elif "fields" in item.keys():`
			`data_index = "fields"`
			`else:`
			`return False`
			`if blacklisted_type in item[data_index].keys():`
			`content = item[data_index][blacklisted_type]`
			`# For every item in the blacklisted array for the type`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`for blacklisted_item in settings.BLACKLISTED[blacklisted_type]:`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`if blacklisted_item == str(content):`
			`# Remove the item`
			`if item in response["hits"]["hits"]:`
			`# Let the UI know something was redacted`
			`if (`
			`"exemption"`
			`not in response["hits"]["hits"][index][data_index]`
			`):`
			`response["redacted"] += 1`
			`# Anonymous`
			`if user.is_anonymous:`
			`# Just set it to none so the index is not off`
			`response["hits"]["hits"][index] = None`
			`else:`
			`if not user.has_perm("core.bypass_blacklist"):`
			`response["hits"]["hits"][index] = None`
			`else:`
			`response["hits"]["hits"][index][data_index][`
			`"exemption"`
			`] = True`

			`# Actually get rid of all the things we set to None`
			`response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]`

Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`def query(self, user, search_query, **kwargs):`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`# For time tracking`
			`start = time.process_time()`
			`if settings.CACHE:`
			`# Sort the keys so the hash is the same`
			`query_normalised = orjson.dumps(search_query, option=orjson.OPT_SORT_KEYS)`
			`hash = siphash(self.hash_key, query_normalised)`
			`cache_hit = r.get(f"query_cache.{user.id}.{hash}")`
			`if cache_hit:`
			`response = orjson.loads(cache_hit)`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`time_took = (time.process_time() - start) * 1000`
			`# Round to 3 significant figures`
			`time_took_rounded = round(`
			`time_took, 3 - int(floor(log10(abs(time_took)))) - 1`
			`)`
			`return {`
			`"object_list": response,`
			`"took": time_took_rounded,`
			`"cache": True,`
			`}`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`response = self.run_query(user, search_query, **kwargs)`

			`# For Elasticsearch`
			`if isinstance(response, Exception):`
			`message = f"Error: {response.info['error']['root_cause'][0]['type']}"`
			`message_class = "danger"`
			`return {"message": message, "class": message_class}`
Add the time taken even where there are no hits 2023-02-09 07:20:28 +00:00			`if "took" in response:`
			`if response["took"] is None:`
			`return None`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00			`if len(response["hits"]["hits"]) == 0:`
			`message = "No results."`
			`message_class = "danger"`
Add the time taken even where there are no hits 2023-02-09 07:20:28 +00:00			`time_took = (time.process_time() - start) * 1000`
			`# Round to 3 significant figures`
			`time_took_rounded = round(`
			`time_took, 3 - int(floor(log10(abs(time_took)))) - 1`
			`)`
			`return {`
			`"message": message,`
			`"class": message_class,`
			`"took": time_took_rounded,`
			`}`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00
			`# For Druid`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`if "error" in response:`
			`if "errorMessage" in response:`
			`context = {`
			`"message": response["errorMessage"],`
			`"class": "danger",`
			`}`
			`return context`
			`else:`
			`return response`
Finish reimplementing Elasticsearch 2022-11-23 18:15:42 +00:00
			`# Removed for now, no point given we have restricted indexes`
			`# self.filter_blacklisted(user, response)`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`# Parse the response`
			`response_parsed = self.parse(response)`

Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`# Write cache`
			`if settings.CACHE:`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`to_write_cache = orjson.dumps(response_parsed)`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`r.set(f"query_cache.{user.id}.{hash}", to_write_cache)`
			`r.expire(f"query_cache.{user.id}.{hash}", settings.CACHE_TIMEOUT)`

			`time_took = (time.process_time() - start) * 1000`
			`# Round to 3 significant figures`
			`time_took_rounded = round(time_took, 3 - int(floor(log10(abs(time_took)))) - 1)`
			`return {"object_list": response_parsed, "took": time_took_rounded}`

Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`@abstractmethod`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`def query_results(self, **kwargs):`
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`pass`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`def process_results(self, response, **kwargs):`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`if kwargs.get("annotate"):`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`annotate_results(response)`
Fix annotating results and remove debugging code 2022-11-23 18:39:36 +00:00			`if kwargs.get("reverse"):`
Mutate the response when reversing 2022-11-23 18:52:48 +00:00			`response.reverse()`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`if kwargs.get("dedup"):`
Fix dedup 2022-12-09 07:20:28 +00:00			`dedup_fields = kwargs.get("dedup_fields")`
			`if not dedup_fields:`
Implement Druid DB fetching 2022-09-30 06:22:22 +00:00			`dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]`
Implement notification rules and settings 2023-01-12 07:20:43 +00:00			`response = dedup_list(response, dedup_fields)`
Fix deduplication function 2022-12-09 07:20:59 +00:00			`return response`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`@abstractmethod`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00			`def parse(self, response):`
Begin refactoring Elastic backend to use helper functions 2022-11-21 19:43:23 +00:00			`pass`