neptune/core/lib/rules.py

from yaml import dump, load
from yaml.parser import ParserError
from yaml.scanner import ScannerError

try:
    from yaml import CDumper as Dumper
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader, Dumper

import uuid
from datetime import datetime

import orjson
from siphashc import siphash

from core.lib.notify import sendmsg
from core.lib.parsing import parse_index, parse_source
from core.util import logs

log = logs.get_logger("rules")

SECONDS_PER_UNIT = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}

MAX_WINDOW = 2592000
MAX_AMOUNT_NTFY = 10
MAX_AMOUNT_WEBHOOK = 1000
HIGH_FREQUENCY_MIN_SEC = 60


class RuleParseError(Exception):
    def __init__(self, message, field):
        super().__init__(message)
        self.field = field


def format_ntfy(**kwargs):
    """
    Format a message for ntfy.
    If the message is a list, it will be joined with newlines.
    If the message is None, it will be replaced with an empty string.
    If specified, `matched` will be pretty-printed in the first line.
    kwargs:
        rule: The rule object, must be specified
        index: The index the rule matched on, can be None
        message: The message to send, can be None
        matched: The matched fields, can be None
        total_hits: The total number of matches, optional
    """
    rule = kwargs.get("rule")
    index = kwargs.get("index")
    message = kwargs.get("message")
    matched = kwargs.get("matched")
    total_hits = kwargs.get("total_hits", 0)
    if message:
        # Dump the message in YAML for readability
        messages_formatted = ""
        if isinstance(message, list):
            for message_iter in message:
                messages_formatted += dump(
                    message_iter, Dumper=Dumper, default_flow_style=False
                )
                messages_formatted += "\n"
        else:
            messages_formatted = dump(message, Dumper=Dumper, default_flow_style=False)
    else:
        messages_formatted = ""

    if matched:
        matched = ", ".join([f"{k}: {v}" for k, v in matched.items()])
    else:
        matched = ""

    notify_message = f"{rule.name} on {index}: {matched}\n{messages_formatted}"
    notify_message += f"\nTotal hits: {total_hits}"
    notify_message = notify_message.encode("utf-8", "replace")

    return notify_message


def format_webhook(**kwargs):
    """
    Format a message for a webhook.
    Adds some metadata to the message that would normally be only in
    notification_settings.
    Dumps the message in JSON.
    kwargs:
        rule: The rule object, must be specified
        index: The index the rule matched on, can be None
        message: The message to send, can be None, but will be sent as None
        matched: The matched fields, can be None, but will be sent as None
        total_hits: The total number of matches, optional
        notification_settings: The notification settings, must be specified
            priority: The priority of the message, optional
            topic: The topic of the message, optional
    """
    rule = kwargs.get("rule")
    index = kwargs.get("index")
    message = kwargs.get("message")
    matched = kwargs.get("matched")
    total_hits = kwargs.get("total_hits", 0)
    notification_settings = kwargs.get("notification_settings")
    notify_message = {
        "rule_id": rule.id,
        "rule_name": rule.name,
        "matched": matched,
        "total_hits": total_hits,
        "index": index,
        "data": message,
    }
    if "priority" in notification_settings:
        notify_message["priority"] = notification_settings["priority"]
    if "topic" in notification_settings:
        notify_message["topic"] = notification_settings["topic"]
    notify_message = orjson.dumps(notify_message)

    return notify_message


def rule_notify(rule, index, message, meta=None):
    """
    Send a notification for a matching rule.
    Gets the notification settings for the rule.
    Runs the formatting helpers for the service.
    :param rule: The rule object, must be specified
    :param index: The index the rule matched on, can be None
    :param message: The message to send, can be None
    :param meta: dict of metadata, contains `aggs` key for the matched fields
    """
    # If there is no message, don't say anything matched
    if message:
        word = "match"
    else:
        word = "no match"

    title = f"Rule {rule.name} {word} on {index}"

    # The user notification settings are merged in with this
    notification_settings = rule.get_notification_settings()
    if not notification_settings:
        # No/invalid notification settings, don't send anything
        return
    if notification_settings.get("service") == "none":
        # Don't send anything
        return

    # Create a cast we can reuse for the formatting helpers and sendmsg
    cast = {
        "title": title,
        "user": rule.user,
        "rule": rule,
        "index": index,
        "message": message,
        "notification_settings": notification_settings,
    }
    if meta:
        if "matched" in meta:
            cast["matched"] = meta["matched"]
        if "total_hits" in meta:
            cast["total_hits"] = meta["total_hits"]

    if rule.service == "ntfy":
        cast["msg"] = format_ntfy(**cast)

    elif rule.service == "webhook":
        cast["msg"] = format_webhook(**cast)

    sendmsg(**cast)


class NotificationRuleData(object):
    def __init__(self, user, cleaned_data, db):
        self.user = user
        self.object = None

        # We are running live and have been passed a database object
        if not isinstance(cleaned_data, dict):
            self.object = cleaned_data
            cleaned_data = cleaned_data.__dict__

        self.cleaned_data = cleaned_data
        self.db = db
        self.data = self.cleaned_data.get("data")
        self.window = self.cleaned_data.get("window")
        self.policy = self.cleaned_data.get("policy")
        self.parsed = None
        self.aggs = {}

        self.validate_user_permissions()

        self.parse_data()
        self.ensure_list()
        self.validate_permissions()
        self.validate_schedule_fields()
        self.validate_time_fields()
        if self.object is not None:
            self.populate_matched()

    def clear_database_matches(self):
        """
        Delete all matches for this rule.
        """
        rule_id = str(self.object.id)
        self.db.delete_rule_entries(rule_id)

    def populate_matched(self):
        """
        On first creation, the match field is None. We need to populate it with
        a dictionary containing the index names as keys and False as values.
        """
        if self.object.match is None:
            self.object.match = {}
        for index in self.parsed["index"]:
            if index not in self.object.match:
                self.object.match[index] = False
        self.object.save()

    def format_matched(self, messages):
        matched = {}
        for message in messages:
            for field, value in self.parsed.items():
                if field == "msg":
                    # Allow partial matches for msg
                    for msg in value:
                        if "msg" in message:
                            if msg.lower() in message["msg"].lower():
                                matched[field] = msg
                                # Break out of the msg matching loop
                                break
                    # Continue to next field
                    continue
                if field in message and message[field] in value:
                    # Do exact matches for all other fields
                    matched[field] = message[field]
        return matched

    def store_match(self, index, match):
        """
        Store a match result.
        Accepts None for the index to set all indices.
        :param index: the index to store the match for, can be None
        :param match: the object that matched
        """
        if match is not False:
            # Dump match to JSON while sorting the keys
            match_normalised = orjson.dumps(match, option=orjson.OPT_SORT_KEYS)
            match = siphash(self.db.hash_key, match_normalised)

        if self.object.match is None:
            self.object.match = {}
        if not isinstance(self.object.match, dict):
            self.object.match = {}

        if index is None:
            for index_iter in self.parsed["index"]:
                self.object.match[index_iter] = match
        else:
            self.object.match[index] = match
        self.object.save()
        log.debug(f"Stored match: {index} - {match}")

    def get_match(self, index=None, match=None):
        """
        Get a match result for an index.
        If the index is None, it will return True if any index has a match.
        :param index: the index to get the match for, can be None
        """
        if self.object.match is None:
            self.object.match = {}
            self.object.save()
            return None
        if not isinstance(self.object.match, dict):
            return None

        if index is None:
            # Check if we have any matches on all indices
            values = self.object.match.values()
            if not values:
                return None
            return any(values)

        # Check if it's the same hash
        if match is not None:
            match_normalised = orjson.dumps(match, option=orjson.OPT_SORT_KEYS)
            match = siphash(self.db.hash_key, match_normalised)
            hash_matches = self.object.match.get(index) == match
            return hash_matches

        returned_match = self.object.match.get(index, None)
        if type(returned_match) == int:
            # We are getting a hash from the database,
            # but we have nothing to check it against.
            # In this instance, we are checking if we got a match
            # at all last time. We can confidently say that since
            # we have a hash, we did.
            returned_match = True
        return returned_match

    def format_aggs(self, aggs):
        """
        Format aggregations for the query.
        We have self.aggs, which contains:
        {"avg_sentiment": (">", 0.5)}
        and aggs, which contains:
        {"avg_sentiment": {"value": 0.6}}
        It's matched already, we just need to format it like so:
        {"avg_sentiment": "0.06>0.5"}
        :param aggs: the aggregations to format
        :return: the formatted aggregations
        """
        new_aggs = {}
        for agg_name, agg in aggs.items():
            if agg_name in self.aggs:
                op, value = self.aggs[agg_name]
                new_aggs[agg_name] = f"{agg['value']}{op}{value}"

        return new_aggs

    def reform_matches(self, index, matches, meta, mode):
        if not isinstance(matches, list):
            matches = [matches]
        matches_copy = matches.copy()
        match_ts = datetime.utcnow().isoformat()
        batch_id = uuid.uuid4()

        # Filter empty fields in meta
        meta = {k: v for k, v in meta.items() if v}

        for match_index, _ in enumerate(matches_copy):
            matches_copy[match_index]["index"] = index
            matches_copy[match_index]["rule_id"] = str(self.object.id)
            matches_copy[match_index]["meta"] = meta
            matches_copy[match_index]["match_ts"] = match_ts
            matches_copy[match_index]["mode"] = mode
            matches_copy[match_index]["batch_id"] = str(batch_id)
        return matches_copy

    async def ingest_matches(self, index, matches, meta, mode):
        """
        Store all matches for an index.
        :param index: the index to store the matches for
        :param matches: the matches to store
        """
        new_matches = self.reform_matches(index, matches, meta, mode)
        await self.db.async_store_matches(new_matches)

    def ingest_matches_sync(self, index, matches, meta, mode):
        """
        Store all matches for an index.
        :param index: the index to store the matches for
        :param matches: the matches to store
        """
        new_matches = self.reform_matches(index, matches, meta, mode)
        self.db.store_matches(new_matches)

    async def rule_matched(self, index, message, meta, mode):
        """
        A rule has matched.
        If the previous run did not match, send a notification after formatting
        the aggregations.
        :param index: the index the rule matched on
        :param message: the message object that matched
        :param aggs: the aggregations that matched
        """
        current_match = self.get_match(index, message)
        log.debug(f"Rule matched: {index} - current match: {current_match}")

        last_run_had_matches = current_match is True

        if self.policy in ["change", "default"]:
            # Change or Default policy, notifying only on new results
            if last_run_had_matches:
                # Last run had matches, and this one did too
                # We don't need to notify
                return

        elif self.policy == "always":
            # Only here for completeness, we notify below by default
            pass

        # We hit the return above if we don't need to notify
        if "matched" not in meta:
            meta["matched"] = self.format_matched(message)
        if "aggs" in meta:
            aggs_formatted = self.format_aggs(meta["aggs"])
            if aggs_formatted:
                meta["matched_aggs"] = aggs_formatted

        rule_notify(self.object, index, message, meta)
        self.store_match(index, message)
        await self.ingest_matches(index, message, meta, mode)

    def rule_matched_sync(self, index, message, meta, mode):
        """
        A rule has matched.
        If the previous run did not match, send a notification after formatting
        the aggregations.
        :param index: the index the rule matched on
        :param message: the message object that matched
        :param aggs: the aggregations that matched
        """
        current_match = self.get_match(index, message)
        log.debug(f"Rule matched: {index} - current match: {current_match}")

        last_run_had_matches = current_match is True

        if self.policy in ["change", "default"]:
            # Change or Default policy, notifying only on new results
            if last_run_had_matches:
                # Last run had matches, and this one did too
                # We don't need to notify
                return

        elif self.policy == "always":
            # Only here for completeness, we notify below by default
            pass

        # We hit the return above if we don't need to notify
        if "matched" not in meta:
            meta["matched"] = self.format_matched(message)
        if "aggs" in meta:
            aggs_formatted = self.format_aggs(meta["aggs"])
            if aggs_formatted:
                meta["matched_aggs"] = aggs_formatted

        rule_notify(self.object, index, message, meta)
        self.store_match(index, message)
        self.ingest_matches_sync(index, message, meta, mode)

    # No async helper for this one as we only need it for schedules
    async def rule_no_match(self, index=None, message=None):
        """
        A rule has not matched.
        If the previous run did match, send a notification if configured to notify
        for empty matches.
        :param index: the index the rule did not match on, can be None

        """
        current_match = self.get_match(index)
        log.debug(
            f"Rule not matched: {index} - current match: {current_match}: {message}"
        )

        last_run_had_matches = current_match is True
        initial = current_match is None

        self.store_match(index, False)

        if self.policy != "always":
            # We hit the return above if we don't need to notify
            if self.policy in ["change", "default"]:
                if not last_run_had_matches and not initial:
                    # We don't need to notify if the last run didn't have matches
                    return

        if self.policy in ["always", "change"]:
            # Never notify for empty matches on default policy
            rule_notify(self.object, index, "no_match", None)
            await self.ingest_matches(
                index=index,
                matches=[{"msg": None}],
                meta={"msg": message},
                mode="schedule",
            )

    async def run_schedule(self):
        """
        Run the schedule query.
        Get the results from the database, and check if the rule has matched.
        Check if all of the required aggregations have matched.
        """
        response = await self.db.schedule_query_results(self)
        if not response:
            # No results in the result_map
            await self.rule_no_match(message="No response from database")
            return
        for index, (meta, results) in response.items():
            if not results:
                # Falsy results, no matches
                await self.rule_no_match(index, message="No results for index")
                continue

            # Add the match values of all aggregations to a list
            aggs_for_index = []
            for agg_name in self.aggs.keys():
                if agg_name in meta["aggs"]:
                    if "match" in meta["aggs"][agg_name]:
                        aggs_for_index.append(meta["aggs"][agg_name]["match"])

            # All required aggs are present
            if len(aggs_for_index) == len(self.aggs.keys()):
                if all(aggs_for_index):
                    # All aggs have matched
                    await self.rule_matched(
                        index, results[: self.object.amount], meta, mode="schedule"
                    )
                    continue
            # Default branch, since the happy path has a continue keyword
            await self.rule_no_match(index, message="Aggregation did not match")

    def test_schedule(self):
        """
        Test the schedule query to ensure it is valid.
        Raises an exception if the query is invalid.
        """
        if self.db:
            self.db.schedule_query_results_test_sync(self)

    def validate_schedule_fields(self):
        """
        Ensure schedule fields are valid.
        index: can be a list, it will schedule one search per index.
        source: can be a list, it will be the filter for each search.
        tokens: can be list, it will ensure the message matches any token.
        msg: can be a list, it will ensure the message contains any msg.
        No other fields can be lists containing more than one item.
        :raises RuleParseError: if the fields are invalid
        """
        is_schedule = self.is_schedule

        if is_schedule:
            allowed_list_fields = ["index", "source", "tokens", "msg"]
            for field, value in self.parsed.items():
                if field not in allowed_list_fields:
                    if len(value) > 1:
                        raise RuleParseError(
                            (
                                f"For scheduled rules, field {field} cannot contain "
                                "more than one item"
                            ),
                            "data",
                        )
                    if len(str(value[0])) == 0:
                        raise RuleParseError(f"Field {field} cannot be empty", "data")
            if "sentiment" in self.parsed:
                sentiment = str(self.parsed["sentiment"][0])
                sentiment = sentiment.strip()
                if sentiment[0] not in [">", "<", "="]:
                    raise RuleParseError(
                        (
                            "Sentiment field must be a comparison operator and then a "
                            "float: >0.02"
                        ),
                        "data",
                    )
                operator = sentiment[0]
                number = sentiment[1:]

                try:
                    number = float(number)
                except ValueError:
                    raise RuleParseError(
                        (
                            "Sentiment field must be a comparison operator and then a "
                            "float: >0.02"
                        ),
                        "data",
                    )
                self.aggs["avg_sentiment"] = (operator, number)

        else:
            if "query" in self.parsed:
                raise RuleParseError(
                    "Field query cannot be used with on-demand rules", "data"
                )
            if "tags" in self.parsed:
                raise RuleParseError(
                    "Field tags cannot be used with on-demand rules", "data"
                )
            if self.policy != "default":
                raise RuleParseError(
                    (
                        f"Cannot use {self.cleaned_data['policy']} policy with "
                        "on-demand rules"
                    ),
                    "policy",
                )

    @property
    def is_schedule(self):
        """
        Check if the rule is a schedule rule.
        :return: True if the rule is a schedule rule, False otherwise
        """
        if "interval" in self.cleaned_data:
            if self.cleaned_data["interval"] != 0:
                return True
        return False

    def ensure_list(self):
        """
        Ensure all values in the data field are lists.
        Convert all strings to lists with one item.
        """
        for field, value in self.parsed.items():
            if not isinstance(value, list):
                self.parsed[field] = [value]

    def validate_user_permissions(self):
        """
        Ensure the user can use notification rules.
        :raises RuleParseError: if the user does not have permission
        """
        if not self.user.has_perm("core.use_rules"):
            raise RuleParseError("User does not have permission to use rules", "data")

    def validate_time_fields(self):
        """
        Validate the interval and window fields.
        Prohibit window being specified with an ondemand interval.
        Prohibit window not being specified with a non-ondemand interval.
        Prohibit amount being specified with an on-demand interval.
        Prohibut amount not being specified with a non-ondemand interval.
        Validate window field.
        Validate window unit and enforce maximum.
        :raises RuleParseError: if the fields are invalid
        """
        interval = self.cleaned_data.get("interval")
        window = self.cleaned_data.get("window")
        amount = self.cleaned_data.get("amount")
        service = self.cleaned_data.get("service")

        on_demand = interval == 0

        # Not on demand and interval is too low
        if not on_demand and interval <= HIGH_FREQUENCY_MIN_SEC:
            if not self.user.has_perm("core.rules_high_frequency"):
                raise RuleParseError(
                    "User does not have permission to use high frequency rules", "data"
                )

        if not on_demand:
            if not self.user.has_perm("core.rules_scheduled"):
                raise RuleParseError(
                    "User does not have permission to use scheduled rules", "data"
                )

        if on_demand and window is not None:
            # Interval is on demand and window is specified
            # We can't have a window with on-demand rules
            raise RuleParseError(
                "Window cannot be specified with on-demand interval", "window"
            )

        if not on_demand and window is None:
            # Interval is not on demand and window is not specified
            # We can't have a non-on-demand interval without a window
            raise RuleParseError(
                "Window must be specified with non-on-demand interval", "window"
            )

        if not on_demand and amount is None:
            # Interval is not on demand and amount is not specified
            # We can't have a non-on-demand interval without an amount
            raise RuleParseError(
                "Amount must be specified with non-on-demand interval", "amount"
            )
        if on_demand and amount is not None:
            # Interval is on demand and amount is specified
            # We can't have an amount with on-demand rules
            raise RuleParseError(
                "Amount cannot be specified with on-demand interval", "amount"
            )

        if window is not None:
            window_number = window[:-1]
            if not window_number.isdigit():
                raise RuleParseError("Window prefix must be a number", "window")
            window_number = int(window_number)
            window_unit = window[-1]
            if window_unit not in SECONDS_PER_UNIT:
                raise RuleParseError(
                    (
                        "Window unit must be one of "
                        f"{', '.join(SECONDS_PER_UNIT.keys())},"
                        f" not '{window_unit}'"
                    ),
                    "window",
                )
            window_seconds = window_number * SECONDS_PER_UNIT[window_unit]
            if window_seconds > MAX_WINDOW:
                raise RuleParseError(
                    f"Window cannot be larger than {MAX_WINDOW} seconds (30 days)",
                    "window",
                )

        if amount is not None:
            if service == "ntfy":
                if amount > MAX_AMOUNT_NTFY:
                    raise RuleParseError(
                        f"Amount cannot be larger than {MAX_AMOUNT_NTFY} for ntfy",
                        "amount",
                    )
            else:
                if amount > MAX_AMOUNT_WEBHOOK:
                    raise RuleParseError(
                        (
                            f"Amount cannot be larger than {MAX_AMOUNT_WEBHOOK} for "
                            f"{service}"
                        ),
                        "amount",
                    )

    def validate_permissions(self):
        """
        Validate permissions for the source and index variables.
        Also set the default values for the user if not present.
        Stores the default or expanded values in the parsed field.
        :raises QueryError: if the user does not have permission to use the source
        """
        if "index" in self.parsed:
            index = self.parsed["index"]
            if type(index) == list:
                for i in index:
                    parse_index(self.user, {"index": i}, raise_error=True)
            # else:
            #     db.parse_index(self.user, {"index": index}, raise_error=True)
        else:
            # Get the default value for the user if not present
            index = parse_index(self.user, {}, raise_error=True)
            self.parsed["index"] = [index]

        if "source" in self.parsed:
            source = self.parsed["source"]
            if type(source) == list:
                for i in source:
                    parse_source(self.user, {"source": i}, raise_error=True)
            # else:
            #     parse_source(self.user, {"source": source}, raise_error=True)
        else:
            # Get the default value for the user if not present
            source = parse_source(self.user, {}, raise_error=True)
            self.parsed["source"] = source

    def parse_data(self):
        """
        Parse the data in the text field to YAML.
        :raises RuleParseError: if the data is invalid
        """
        try:
            self.parsed = load(self.data, Loader=Loader)
        except (ScannerError, ParserError) as e:
            raise RuleParseError(f"Invalid YAML: {e}", "data")

    def __str__(self):
        """
        Get a YAML representation of the data field of the rule.
        """
        return dump(self.parsed, Dumper=Dumper)

    def get_data(self):
        """
        Return the data field as a dictionary.
        """
        return self.parsed