neptune/core/db/__init__.py
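
"""Storage backend abstraction.

Defines the abstract ``StorageBackend`` base class: query-parameter
parsing and validation, permission checks, cached query execution and
blacklist filtering of search responses. Concrete backends implement
the abstract hooks (``initialise``, ``construct_query``, ``run_query``,
``query_results`` and ``parse``).
"""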

import random
import string
import time
from abc import ABC, abstractmethod
from datetime import datetime
from math import floor, log10

import orjson
from django.conf import settings
from siphashc import siphash

from core import r
from core.db.processing import annotate_results
from core.util import logs
from core.views import helpers


class StorageBackend(ABC):
    def __init__(self, name):
        self.log = logs.get_logger(name)
        self.log.info(f"Initialising storage backend {name}")
        self.initialise_caching()
        self.initialise()

    @abstractmethod
    def initialise(self, **kwargs):
        pass

    def initialise_caching(self):
        # Reuse the shared cache hash key if one exists, otherwise create one
        hash_key = r.get("cache_hash_key")
        if not hash_key:
            letters = string.ascii_lowercase
            hash_key = "".join(random.choice(letters) for _ in range(16))
            self.log.debug(f"Created new hash key: {hash_key}")
            r.set("cache_hash_key", hash_key)
        else:
            hash_key = hash_key.decode("ascii")
            self.log.debug(f"Decoded hash key: {hash_key}")
        self.hash_key = hash_key

    @abstractmethod
    def construct_query(self, **kwargs):
        pass

    @abstractmethod
    def run_query(self, **kwargs):
        pass

    def parse_size(self, query_params, sizes):
        if "size" in query_params:
            size = query_params["size"]
            if size not in sizes:
                message = "Size is not permitted"
                message_class = "danger"
                return {"message": message, "class": message_class}
            size = int(size)
        else:
            size = 15
        return size

    def parse_index(self, user, query_params):
        if "index" in query_params:
            index = query_params["index"]
            if index == "main":
                index = settings.INDEX_MAIN
            else:
                if not user.has_perm(f"core.index_{index}"):
                    message = "Not permitted to search by this index"
                    message_class = "danger"
                    return {
                        "message": message,
                        "class": message_class,
                    }
                if index == "meta":
                    index = settings.INDEX_META
                elif index == "internal":
                    index = settings.INDEX_INT
                elif index == "restricted":
                    if not user.has_perm("core.restricted_sources"):
                        message = "Not permitted to search by this index"
                        message_class = "danger"
                        return {
                            "message": message,
                            "class": message_class,
                        }
                    index = settings.INDEX_RESTRICTED
                else:
                    message = "Index is not valid."
                    message_class = "danger"
                    return {
                        "message": message,
                        "class": message_class,
                    }
        else:
            index = settings.INDEX_MAIN
        return index

    def parse_query(self, query_params, tags, size, index, custom_query, add_bool):
        query_created = False
        if "query" in query_params:
            query = query_params["query"]
            search_query = self.construct_query(query, size, index)
            query_created = True
        else:
            if custom_query:
                search_query = custom_query
            else:
                search_query = self.construct_query(None, size, index, blank=True)
        if tags:
            # Get a blank search query
            if not query_created:
                search_query = self.construct_query(None, size, index, blank=True)
                query_created = True
            for item in tags:
                for tagname, tagvalue in item.items():
                    add_bool.append({tagname: tagvalue})
        valid = self.check_valid_query(query_params, custom_query)
        if isinstance(valid, dict):
            return valid
        return search_query

    def check_valid_query(self, query_params, custom_query):
        required_any = ["query", "tags"]
        if not any(field in query_params for field in required_any):
            if not custom_query:
                message = "Empty query!"
                message_class = "warning"
                return {"message": message, "class": message_class}

    def parse_source(self, user, query_params):
        source = None
        if "source" in query_params:
            source = query_params["source"]
            if source in settings.SOURCES_RESTRICTED:
                if not user.has_perm("core.restricted_sources"):
                    message = "Access denied"
                    message_class = "danger"
                    return {"message": message, "class": message_class}
            elif source not in settings.MAIN_SOURCES:
                message = "Invalid source"
                message_class = "danger"
                return {"message": message, "class": message_class}
            if source == "all":
                source = None  # the next block will populate it
        if source:
            sources = [source]
        else:
            sources = list(settings.MAIN_SOURCES)
            if user.has_perm("core.restricted_sources"):
                for source_iter in settings.SOURCES_RESTRICTED:
                    sources.append(source_iter)
        if "all" in sources:
            sources.remove("all")
        return sources

    def parse_sort(self, query_params):
        sort = None
        if "sorting" in query_params:
            sorting = query_params["sorting"]
            if sorting not in ("asc", "desc", "none"):
                message = "Invalid sort"
                message_class = "danger"
                return {"message": message, "class": message_class}
            if sorting == "asc":
                sort = "ascending"
            elif sorting == "desc":
                sort = "descending"
        return sort

    def parse_date_time(self, query_params):
        if {"from_date", "to_date", "from_time", "to_time"}.issubset(
            query_params.keys()
        ):
            from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
            to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
            from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
            to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
            return (from_ts, to_ts)
        return (None, None)

    def parse_sentiment(self, query_params):
        sentiment = None
        if "check_sentiment" in query_params:
            if "sentiment_method" not in query_params:
                message = "No sentiment method"
                message_class = "danger"
                return {"message": message, "class": message_class}
            if "sentiment" in query_params:
                sentiment = query_params["sentiment"]
                try:
                    sentiment = float(sentiment)
                except ValueError:
                    message = "Sentiment is not a float"
                    message_class = "danger"
                    return {"message": message, "class": message_class}
            sentiment_method = query_params["sentiment_method"]
            return (sentiment_method, sentiment)

    def filter_blacklisted(self, user, response):
        """
        Low-level filter to take the raw search response and remove
        objects from it that we want to keep secret.
        Does not return; the response object is mutated in place.
        """
        response["redacted"] = 0
        response["exemption"] = None
        if user.is_superuser:
            response["exemption"] = True
        # is_anonymous = isinstance(user, AnonymousUser)
        # For every hit from ES
        for index, item in enumerate(list(response["hits"]["hits"])):
            # For every blacklisted type
            for blacklisted_type in settings.ELASTICSEARCH_BLACKLISTED.keys():
                # Check that the field we are matching on exists
                if "_source" in item.keys():
                    data_index = "_source"
                elif "fields" in item.keys():
                    data_index = "fields"
                else:
                    return False
                if blacklisted_type in item[data_index].keys():
                    content = item[data_index][blacklisted_type]
                    # For every item in the blacklisted array for the type
                    for blacklisted_item in settings.ELASTICSEARCH_BLACKLISTED[
                        blacklisted_type
                    ]:
                        if blacklisted_item == str(content):
                            # Remove the item
                            if item in response["hits"]["hits"]:
                                # Let the UI know something was redacted
                                if (
                                    "exemption"
                                    not in response["hits"]["hits"][index][data_index]
                                ):
                                    response["redacted"] += 1
                                # Anonymous users never see blacklisted hits
                                if user.is_anonymous:
                                    # Just set it to None so the index is not off
                                    response["hits"]["hits"][index] = None
                                else:
                                    if not user.has_perm("core.bypass_blacklist"):
                                        response["hits"]["hits"][index] = None
                                    else:
                                        response["hits"]["hits"][index][data_index][
                                            "exemption"
                                        ] = True
        # Actually get rid of all the entries we set to None
        response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]

    def query(self, user, search_query):
        # For time tracking
        start = time.process_time()
        if settings.CACHE:
            # Sort the keys so identical queries hash identically
            query_normalised = orjson.dumps(search_query, option=orjson.OPT_SORT_KEYS)
            query_hash = siphash(self.hash_key, query_normalised)
            cache_hit = r.get(f"query_cache.{user.id}.{query_hash}")
            if cache_hit:
                response = orjson.loads(cache_hit)
                self.log.debug(f"Cache hit: {response}")
                time_took = (time.process_time() - start) * 1000
                # Round to 3 significant figures, guarding against log10(0)
                time_took_rounded = (
                    round(time_took, 3 - int(floor(log10(abs(time_took)))) - 1)
                    if time_took
                    else 0.0
                )
                return {
                    "object_list": response,
                    "took": time_took_rounded,
                    "cache": True,
                }
        response = self.run_query(user, search_query)
        if "error" in response:
            if "errorMessage" in response:
                context = {
                    "message": response["errorMessage"],
                    "class": "danger",
                }
                return context
            else:
                return response
        # response = response.to_dict()
        # print("RESP", response)
        if "took" in response:
            if response["took"] is None:
                return None
        self.filter_blacklisted(user, response)
        # Parse the response
        response_parsed = self.parse(response)
        # Write cache
        if settings.CACHE:
            to_write_cache = orjson.dumps(response_parsed)
            r.set(f"query_cache.{user.id}.{query_hash}", to_write_cache)
            r.expire(f"query_cache.{user.id}.{query_hash}", settings.CACHE_TIMEOUT)
        time_took = (time.process_time() - start) * 1000
        # Round to 3 significant figures, guarding against log10(0)
        time_took_rounded = (
            round(time_took, 3 - int(floor(log10(abs(time_took)))) - 1)
            if time_took
            else 0.0
        )
        return {"object_list": response_parsed, "took": time_took_rounded}

    @abstractmethod
    def query_results(self, **kwargs):
        pass

    def process_results(self, response, **kwargs):
        if kwargs.get("annotate"):
            annotate_results(response)
        if kwargs.get("dedup"):
            # Reverse the results before deduplicating
            response = response[::-1]
            dedup_fields = kwargs.get("dedup_fields") or [
                "msg",
                "nick",
                "ident",
                "host",
                "net",
                "channel",
            ]
            response = helpers.dedup_list(response, dedup_fields)
        return response

    @abstractmethod
    def parse(self, response):
        pass
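

# Illustrative sketch only -- not part of the original module. A hypothetical
# minimal backend showing how the abstract hooks above fit together; the real
# backends live in their own modules and talk to an actual datastore.
class ExampleListBackend(StorageBackend):
    def initialise(self, **kwargs):
        # Hypothetical in-memory "index": a list of dicts shaped like ES hits
        self.store = kwargs.get("store", [])

    def construct_query(self, query, size, index, blank=False):
        # Mirror the call signature used by parse_query() above
        return {"query": None if blank else query, "size": size, "index": index}

    def run_query(self, user, search_query):
        query = search_query["query"]
        hits = [
            {"_source": item}
            for item in self.store
            if query is None or query in str(item)
        ]
        # Return the response shape that query() and filter_blacklisted() expect
        return {"took": 1, "hits": {"hits": hits[: search_query["size"]]}}

    def query_results(self, **kwargs):
        return self.process_results(kwargs.get("response", []), **kwargs)

    def parse(self, response):
        return [hit["_source"] for hit in response["hits"]["hits"]]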