neptune/core/db/processing.py

from datetime import datetime, timezone

from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online


def annotate_results(results):
    """
    Enrich a list of result dicts with online status and channel/user counts.

    For every distinct "net" value found in the results, the nicks and
    channels of the IRC entries (src == "irc") on that network are gathered
    and passed to annotate_online, annotate_num_users and annotate_num_chans.
    Whatever those lookups return for an entry's nick or channel is stored
    on the entry under "online", "num_users" and "num_chans".

    The dicts are modified in place and nothing is returned.
    """
    # Entries without a "net" key (noted in the original as "not discord")
    # are never annotated.
    networks = {entry["net"] for entry in results if "net" in entry}

    def collect_irc(key, network):
        # Unique values of `key` among the IRC entries of one network
        required = {key, "src", "net"}
        return list(
            {
                entry[key]
                for entry in results
                if required.issubset(entry)
                and entry["src"] == "irc"
                and entry["net"] == network
            }
        )

    for network in networks:
        nicks = collect_irc("nick", network)
        channels = collect_irc("channel", network)

        # Online status per nick
        online_info = annotate_online(network, nicks)
        # Number of users per channel
        num_users = annotate_num_users(network, channels)
        # Number of channels per nick
        num_chans = annotate_num_chans(network, nicks)

        for entry in results:
            if "net" not in entry or entry["net"] != network:
                continue
            has_nick = "nick" in entry
            if has_nick and entry["nick"] in online_info:
                entry["online"] = online_info[entry["nick"]]
            if "channel" in entry and entry["channel"] in num_users:
                entry["num_users"] = num_users[entry["channel"]]
            if has_nick and entry["nick"] in num_chans:
                entry["num_chans"] = num_chans[entry["nick"]]


def _flatten_hit(item):
    """Return a new flat dict holding a hit's data, or None if it has none."""
    if "_source" in item:
        # Shallow copy so the caller's response is not modified later on.
        return dict(item["_source"])
    if "fields" in item:
        # "fields" responses wrap every value in a list: keep the first entry
        # and drop the fields whose list is empty.
        return {k: v[0] for k, v in item["fields"].items() if len(v)}
    return None


def _add_date_and_time(element):
    """Add "date" and "time" keys to element, derived from its timestamp."""
    # Some records carry the timestamp under "time" instead of "ts"
    # (marked "will fix data later" in the original); move it to "ts".
    if "ts" not in element and "time" in element:
        element["ts"] = element.pop("time")
    if "ts" not in element:
        return

    ts = element["ts"]
    if not isinstance(ts, str):
        # Non-string values are read as a POSIX timestamp and rendered in UTC.
        ts = datetime.fromtimestamp(ts, tz=timezone.utc).strftime(
            "%Y-%m-%dT%H:%M:%S"
        )

    # Accept "T" or, failing that, a space between the date and the time so a
    # timestamp without "T" no longer raises IndexError.
    separator = "T" if "T" in ts else " "
    parts = ts.split(separator)
    element["date"] = parts[0]
    if len(parts) < 2:
        # No time component at all: only "date" can be provided.
        return

    clock = parts[1]
    pieces = clock.split(".")
    # Strip fractional seconds only when there is exactly one "." in the time.
    element["time"] = pieces[0] if len(pieces) == 2 else clock


def parse_results(results, aggs=None):
    """
    Flatten a search response into a list of plain dicts.

    Each entry of results["hits"]["hits"] is read from its "_source" or,
    failing that, its "fields" key. For every hit:

    * list-wrapped "fields" values are unwrapped first, then "host" and
      "channel" are converted to strings (doing it the other way round
      turned the list itself into a string);
    * the hit's "_id" is stored under "id";
    * keys whose value is an empty string are removed;
    * the timestamp ("ts", or "time" when "ts" is missing) is split into
      "date" and "time" keys.

    The input response is not modified.

    :param results: mapping holding the response
    :param aggs: when truthy, also extract the known aggregations
    :return: the list of parsed hits, or a tuple (aggregations, parsed hits)
        when aggs is truthy. False is returned as soon as a hit has neither
        "_source" nor "fields".
    """
    results_parsed = []
    stringify = ("host", "channel")
    if "hits" in results and "hits" in results["hits"]:
        for item in results["hits"]["hits"]:
            element = _flatten_hit(item)
            if element is None:
                return False

            for field in stringify:
                if field in element:
                    element[field] = str(element[field])
            element["id"] = item["_id"]

            # Remove empty values
            element = {k: v for k, v in element.items() if v != ""}

            _add_date_and_time(element)
            results_parsed.append(element)

    if aggs:
        aggregations = {}
        if "aggregations" in results:
            for field in ["avg_sentiment"]:  # Add other number fields here
                if field in results["aggregations"]:
                    aggregations[field] = results["aggregations"][field]
        return (aggregations, results_parsed)

    return results_parsed


def parse_druid(response):
    """
    Flatten a Druid response into a single list of events.

    Every item of the response must have an "events" key; its entries are
    appended, in order, to the returned list. An item without that key
    raises an Exception naming the offending item.
    """
    flattened = []
    for item in response:
        if "events" not in item:
            raise Exception(f"events not in item {item}")
        flattened.extend(item["events"])
    return flattened
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`from datetime import datetime`

Remove some debugging code 2022-09-06 11:18:58 +00:00			`from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online`

Fix some Manticore queries 2022-09-06 10:53:32 +00:00
Fix annotating results and remove debugging code 2022-11-23 18:39:36 +00:00			`def annotate_results(results):`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`"""`
			`Accept a list of dict objects, search for the number of channels and users.`
			`Add them to the object.`
			`Mutate it in place. Does not return anything.`
			`"""`
			`# Figure out items with net (not discord)`
			`nets = set()`
Fix annotating results and remove debugging code 2022-11-23 18:39:36 +00:00			`for x in results:`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`if "net" in x:`
			`nets.add(x["net"])`

			`for net in nets:`
			`# Annotate the online attribute from Threshold`
			`nicks = list(`
			`set(`
			`[`
			`x["nick"]`
Fix annotating results and remove debugging code 2022-11-23 18:39:36 +00:00			`for x in results`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`if {"nick", "src", "net"}.issubset(x)`
			`and x["src"] == "irc"`
			`and x["net"] == net`
			`]`
			`)`
			`)`
			`channels = list(`
			`set(`
			`[`
			`x["channel"]`
Fix annotating results and remove debugging code 2022-11-23 18:39:36 +00:00			`for x in results`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`if {"channel", "src", "net"}.issubset(x)`
			`and x["src"] == "irc"`
			`and x["net"] == net`
			`]`
			`)`
			`)`
			`online_info = annotate_online(net, nicks)`
			`# Annotate the number of users in the channel`
			`num_users = annotate_num_users(net, channels)`
			`# Annotate the number channels the user is on`
			`num_chans = annotate_num_chans(net, nicks)`
Fix annotating results and remove debugging code 2022-11-23 18:39:36 +00:00			`for item in results:`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`if "net" in item:`
			`if item["net"] == net:`
			`if "nick" in item:`
			`if item["nick"] in online_info:`
			`item["online"] = online_info[item["nick"]]`
			`if "channel" in item:`
			`if item["channel"] in num_users:`
			`item["num_users"] = num_users[item["channel"]]`
			`if "nick" in item:`
			`if item["nick"] in num_chans:`
			`item["num_chans"] = num_chans[item["nick"]]`


Fix window/interval validation and make aggs optional in parse_results 2023-01-15 20:27:19 +00:00			`def parse_results(results, aggs=None):`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`results_parsed = []`
			`stringify = ["host", "channel"]`
			`if "hits" in results.keys():`
			`if "hits" in results["hits"]:`
			`for item in results["hits"]["hits"]:`
			`if "_source" in item.keys():`
			`data_index = "_source"`
			`elif "fields" in item.keys():`
			`data_index = "fields"`
			`else:`
			`return False`
			`element = item[data_index]`
			`for field in stringify:`
			`if field in element:`
			`element[field] = str(element[field])`
			`# Why are fields in lists...`
			`if data_index == "fields":`
			`element = {k: v[0] for k, v in element.items() if len(v)}`
			`element["id"] = item["_id"]`

			`# Remove empty values`
			`for field in list(element.keys()):`
			`if element[field] == "":`
			`del element[field]`

			`# Split the timestamp into date and time`
			`if "ts" not in element:`
			`if "time" in element: # will fix data later`
			`ts = element["time"]`
			`del element["time"]`
			`element["ts"] = ts`
			`if "ts" in element:`
			`if isinstance(element["ts"], str):`
			`ts = element["ts"]`
			`else:`
Remove some debugging code 2022-09-06 11:18:58 +00:00			`ts = datetime.utcfromtimestamp(element["ts"]).strftime(`
			`"%Y-%m-%dT%H:%M:%S"`
			`)`
Fix some Manticore queries 2022-09-06 10:53:32 +00:00			`ts_spl = ts.split("T")`
			`date = ts_spl[0]`
			`time = ts_spl[1]`
			`element["date"] = date`
			`if "." in time:`
			`time_spl = time.split(".")`
			`if len(time_spl) == 2:`
			`element["time"] = time.split(".")[0]`
			`else:`
			`element["time"] = time`
			`else:`
			`element["time"] = time`
			`results_parsed.append(element)`
Implement running scheduled rules and check aggregations 2023-01-15 17:59:12 +00:00			`if aggs:`
			`aggregations = {}`
			`if "aggregations" in results:`
			`for field in ["avg_sentiment"]: # Add other number fields here`
			`if field in results["aggregations"]:`
			`aggregations[field] = results["aggregations"][field]`
			`return (aggregations, results_parsed)`

Remove some debugging code 2022-09-06 11:18:58 +00:00			`return results_parsed`
Begin implementing DB framework 2022-09-27 14:15:08 +00:00

			`def parse_druid(response):`
			`results_parsed = []`
			`for item in response:`
			`if "events" in item:`
			`for event in item["events"]:`
			`results_parsed.append(event)`
			`else:`
			`raise Exception(f"events not in item {item}")`
			`return results_parsed`