Implement running scheduled rules and check aggregations

2023-01-15 17:59:12 +00:00
parent 435d9b5571
commit 6bfa0aa73b
15 changed files with 600 additions and 258 deletions
--- a/core/db/elastic.py
+++ b/core/db/elastic.py
@@ -10,6 +10,15 @@ from core.db import StorageBackend, add_defaults
 # from json import dumps
 # pp = lambda x: print(dumps(x, indent=2))
 from core.db.processing import parse_results
+from core.lib.parsing import (
+    QueryError,
+    parse_date_time,
+    parse_index,
+    parse_sentiment,
+    parse_size,
+    parse_sort,
+    parse_source,
+)


 class ElasticsearchBackend(StorageBackend):
@@ -126,14 +135,16 @@ class ElasticsearchBackend(StorageBackend):
            )
        return query

-    def construct_query(self, query, size, blank=False):
+    def construct_query(self, query, size=None, blank=False):
        """
        Accept some query parameters and construct an Elasticsearch query.
        """
        query_base = {
-            "size": size,
+            # "size": size,
            "query": {"bool": {"must": []}},
        }
+        if size:
+            query_base["size"] = size
        query_string = {
            "query_string": {
                "query": query,
@@ -163,8 +174,8 @@ class ElasticsearchBackend(StorageBackend):
            query_base["query"]["bool"]["must"].append(query_string)
        return query_base

-    def parse(self, response):
-        parsed = parse_results(response)
+    def parse(self, response, **kwargs):
+        parsed = parse_results(response, **kwargs)
        return parsed

    def run_query(self, user, search_query, **kwargs):
@@ -186,6 +197,127 @@ class ElasticsearchBackend(StorageBackend):
            return err
        return response

+    async def async_run_query(self, user, search_query, **kwargs):
+        """
+        Low level helper to run an ES query.
+        Accept a user to pass it to the filter, so we can
+        avoid filtering for superusers.
+        Accept fields and size, for the fields we want to match and the
+        number of results to return.
+        """
+        index = kwargs.get("index")
+        try:
+            response = self.client.search(body=search_query, index=index)
+        except RequestError as err:
+            print("Elasticsearch error", err)
+            return err
+        except NotFoundError as err:
+            print("Elasticsearch error", err)
+            return err
+        return response
+
+    async def schedule_query_results(self, rule_object):
+        """
+        Helper to run a scheduled query with reduced functionality and async.
+        """
+
+        data = rule_object.parsed
+
+        if "tags" in data:
+            tags = data["tags"]
+        else:
+            tags = []
+
+        if "query" in data:
+            query = data["query"][0]
+            data["query"] = query
+
+        result_map = {}
+
+        add_bool = []
+        add_top = []
+        if "source" in data:
+            total_count = len(data["source"])
+            total_sources = len(settings.MAIN_SOURCES) + len(
+                settings.SOURCES_RESTRICTED
+            )
+            if total_count != total_sources:
+                add_top_tmp = {"bool": {"should": []}}
+                for source_iter in data["source"]:
+                    add_top_tmp["bool"]["should"].append(
+                        {"match_phrase": {"src": source_iter}}
+                    )
+                add_top.append(add_top_tmp)
+        for field, values in data.items():
+            if field not in ["source", "index", "tags", "query", "sentiment"]:
+                for value in values:
+                    add_top.append({"match": {field: value}})
+        search_query = self.parse_query(data, tags, None, False, add_bool)
+        self.add_bool(search_query, add_bool)
+        self.add_top(search_query, add_top)
+        if "sentiment" in data:
+            search_query["aggs"] = {
+                "avg_sentiment": {
+                    "avg": {"field": "sentiment"},
+                }
+            }
+        for index in data["index"]:
+
+            if "message" in search_query:
+                self.log.error(f"Error parsing query: {search_query['message']}")
+                continue
+            response = await self.async_run_query(
+                rule_object.user,
+                search_query,
+                index=index,
+            )
+            if isinstance(response, Exception):
+                error = response.info["error"]["root_cause"][0]["reason"]
+                self.log.error(f"Error running scheduled search: {error}")
+                raise QueryError(error)
+            if len(response["hits"]["hits"]) == 0:
+                # No results, skip
+                continue
+            aggs, response = self.parse(response, aggs=True)
+            if "message" in response:
+                self.log.error(f"Error running scheduled search: {response['message']}")
+                continue
+            result_map[index] = (aggs, response)
+
+        # Average aggregation check
+        # Could probably do this in elasticsearch
+        for index, (aggs, result) in result_map.items():
+            # Default to true, if no aggs are found, we still want to match
+            match = True
+            for agg_name, (operator, number) in rule_object.aggs.items():
+                if agg_name in aggs:
+                    agg_value = aggs[agg_name]["value"]
+
+                    # TODO: simplify this, match is default to True
+                    if operator == ">":
+                        if agg_value > number:
+                            match = True
+                        else:
+                            match = False
+                    elif operator == "<":
+                        if agg_value < number:
+                            match = True
+                        else:
+                            match = False
+                    elif operator == "=":
+                        if agg_value == number:
+                            match = True
+                        else:
+                            match = False
+                    else:
+                        match = False
+                else:
+                    # No aggregation found, but it is required
+                    match = False
+                result_map[index][0][agg_name]["match"] = match
+
+        return result_map
+
    def query_results(
        self,
        request,
@@ -224,12 +356,12 @@ class ElasticsearchBackend(StorageBackend):
        else:
            sizes = settings.MAIN_SIZES
        if not size:
-            size = self.parse_size(query_params, sizes)
+            size = parse_size(query_params, sizes)
            if isinstance(size, dict):
                return size

        # I - Index
-        index = self.parse_index(request.user, query_params)
+        index = parse_index(request.user, query_params)
        if isinstance(index, dict):
            return index

@@ -242,7 +374,7 @@ class ElasticsearchBackend(StorageBackend):
            return search_query

        # S - Sources
-        sources = self.parse_source(request.user, query_params)
+        sources = parse_source(request.user, query_params)
        if isinstance(sources, dict):
            return sources
        total_count = len(sources)
@@ -257,7 +389,7 @@ class ElasticsearchBackend(StorageBackend):

        # R - Ranges
        # date_query = False
-        from_ts, to_ts = self.parse_date_time(query_params)
+        from_ts, to_ts = parse_date_time(query_params)
        if from_ts:
            range_query = {
                "range": {
@@ -270,7 +402,7 @@ class ElasticsearchBackend(StorageBackend):
            add_top.append(range_query)

        # S - Sort
-        sort = self.parse_sort(query_params)
+        sort = parse_sort(query_params)
        if isinstance(sort, dict):
            return sort
        if sort:
@@ -286,7 +418,7 @@ class ElasticsearchBackend(StorageBackend):
            search_query["sort"] = sorting

        # S - Sentiment
-        sentiment_r = self.parse_sentiment(query_params)
+        sentiment_r = parse_sentiment(query_params)
        if isinstance(sentiment_r, dict):
            return sentiment_r
        if sentiment_r: