neptune/core/lib/opensearch.py

from django.conf import settings
from opensearchpy import OpenSearch
from opensearchpy.exceptions import RequestError

from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online


def initialise_opensearch():
    """
    Initialise the OpenSearch API client from the Django settings.

    The host, port, credentials and TLS flag are read from the
    ``settings.OPENSEARCH_*`` values. Returns the configured client.
    """
    credentials = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)
    endpoint = {
        "host": settings.OPENSEARCH_URL,
        "port": settings.OPENSEARCH_PORT,
    }
    return OpenSearch(
        hosts=[endpoint],
        # gzip compression of request bodies is left switched off
        http_compress=False,
        http_auth=credentials,
        use_ssl=settings.OPENSEARCH_TLS,
        # NOTE(review): certificate and hostname verification are disabled
        # and the related warnings are silenced.
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )


# Module-level client, created once when this module is imported and
# used by query_results() below.
client = initialise_opensearch()


def annotate_results(results_parsed):
    """
    Annotate search results with presence data from Threshold.

    For every distinct ``net`` found in ``results_parsed``, the nicks and
    channels of that network's IRC items are looked up via
    ``annotate_online``, ``annotate_num_users`` and ``annotate_num_chans``.
    The answers are written back onto the items as ``online``,
    ``num_users`` and ``num_chans`` when the lookup contains the item's
    nick or channel.

    Items without a ``net`` key are never touched. Items missing ``src``,
    ``nick`` or ``channel`` are skipped when building the lookup lists
    instead of raising ``KeyError``.

    The list items are mutated in place. Does not return anything.
    """
    # Only items carrying a "net" key take part (not discord)
    nets = {item["net"] for item in results_parsed if "net" in item}

    for net in nets:
        # Every item of this network, in original order
        net_items = [
            item for item in results_parsed if "net" in item and item["net"] == net
        ]
        # Only IRC items feed the lookups; tolerate items lacking the keys
        irc_items = [item for item in net_items if item.get("src") == "irc"]
        nicks = [item["nick"] for item in irc_items if "nick" in item]
        channels = [item["channel"] for item in irc_items if "channel" in item]

        # Annotate the online attribute from Threshold
        online_info = annotate_online(net, nicks)
        # Annotate the number of users in the channel
        num_users = annotate_num_users(net, channels)
        # Annotate the number channels the user is on
        num_chans = annotate_num_chans(net, nicks)

        for item in net_items:
            has_nick = "nick" in item
            if has_nick and item["nick"] in online_info:
                item["online"] = online_info[item["nick"]]
            if "channel" in item and item["channel"] in num_users:
                item["num_users"] = num_users[item["channel"]]
            if has_nick and item["nick"] in num_chans:
                item["num_chans"] = num_chans[item["nick"]]


def filter_blacklisted(user, response):
    """
    Low level filter that redacts blacklisted hits from a raw OpenSearch
    response.

    ``settings.OPENSEARCH_BLACKLISTED`` maps a ``_source`` field name to
    the values that must stay secret. A hit is blacklisted when the string
    form of one of its fields equals one of the values listed for it.

    - ``response["redacted"]`` is set to the number of blacklisted hits.
      Each hit is counted once, however many entries it matches.
    - ``response["exemption"]`` is ``True`` for superusers, else ``None``.
    - Blacklisted hits are removed from ``response["hits"]["hits"]``,
      except for a non-anonymous superuser, who keeps them with
      ``_source["exemption"] = True``.

    Does not return, the response (and its hit list) is mutated in place.
    """
    response["redacted"] = 0
    response["exemption"] = True if user.is_superuser else None

    hits = response["hits"]["hits"]
    visible = []
    for item in hits:
        source = item["_source"]
        blacklisted = any(
            field in source
            and any(value == str(source[field]) for value in values)
            for field, values in settings.OPENSEARCH_BLACKLISTED.items()
        )
        if not blacklisted:
            visible.append(item)
            continue

        # Let the UI know something was redacted
        response["redacted"] += 1
        if user.is_superuser and not user.is_anonymous:
            # Superusers keep the hit, flagged as an exemption
            source["exemption"] = True
            visible.append(item)

    # Slice-assign so other references to the hit list see the result
    hits[:] = visible


def run_main_query(client, user, query, custom_query=False, index=None, size=None):
    """
    Low level helper that runs one search and redacts the response.

    With ``custom_query`` set, ``query`` is sent unchanged as the request
    body. Otherwise it is wrapped by ``construct_query`` together with
    ``size``. ``index`` falls back to ``settings.OPENSEARCH_INDEX_MAIN``.
    ``user`` is handed to ``filter_blacklisted`` so it can decide what to
    redact.

    Returns the filtered response, or ``False`` when OpenSearch rejects
    the request with a ``RequestError``.
    """
    target_index = index or settings.OPENSEARCH_INDEX_MAIN
    body = query if custom_query else construct_query(query, size)
    try:
        response = client.search(body=body, index=target_index)
    except RequestError as err:
        print("OpenSearch error", err)
        return False
    else:
        filter_blacklisted(user, response)
        return response


def query_results(request, size=None):
    """
    API helper to alter the OpenSearch return format into something
    a bit better to parse.

    Accept a HTTP request object, validate its POST parameters, run the
    query and annotate the results with the other data we have.

    Returns one of:
    - a context dict with ``query``, ``results``, ``card``, ``took``,
      ``redacted`` and ``exemption`` on success;
    - ``{"message": ..., "class": "danger"}`` when ``size``, ``source``
      or ``sentiment`` fails validation;
    - ``False`` when the search itself failed;
    - ``None`` when the request carries no ``query`` parameter.

    Each result is the hit's ``_source`` with ``id`` added. ``date`` and
    ``time`` are added when the hit has a string ``ts``. A ``ts`` without
    a "T" separator yields an empty ``time`` instead of raising.
    """
    user = request.user
    post = request.POST

    if user.is_anonymous:
        sizes = settings.OPENSEARCH_MAIN_SIZES_ANON
    else:
        sizes = settings.OPENSEARCH_MAIN_SIZES

    # A size passed by the caller wins and is not checked against `sizes`
    if not size and "size" in post:
        size = post["size"]
        if size not in sizes:
            return {"message": "Size is not permitted", "class": "danger"}

    add_bool = []
    if "source" in post:
        source = post["source"]
        if source not in settings.OPENSEARCH_MAIN_SOURCES:
            return {"message": "Invalid source", "class": "danger"}
        if source != "all":
            add_bool.append({"src": source})

    if "check-sentiment" in post and "sentiment" in post:
        # The value is only validated here; it is not added to the query
        try:
            float(post["sentiment"])
        except ValueError:
            return {"message": "Sentiment is not a float", "class": "danger"}

    if "query" not in post:
        return None

    query = post["query"]
    search_query = construct_query(query, size)
    for clause in add_bool:
        search_query["query"]["bool"]["must"].append({"match": clause})

    results = run_main_query(
        client,
        user,  # passed through run_main_query to filter_blacklisted
        search_query,
        custom_query=True,
        size=size,
    )
    if not results:
        return False

    results_parsed = []
    for hit in results.get("hits", {}).get("hits", []):
        element = hit["_source"]
        element["id"] = hit["_id"]

        # Split the timestamp into date and time; skip hits without one
        ts = element.get("ts")
        if isinstance(ts, str):
            ts_parts = ts.split("T")
            element["date"] = ts_parts[0]
            element["time"] = ts_parts[1] if len(ts_parts) > 1 else ""
        results_parsed.append(element)

    annotate_results(results_parsed)

    return {
        "query": query,
        "results": results_parsed,
        "card": results["hits"]["total"]["value"],
        "took": results["took"],
        "redacted": results["redacted"],
        "exemption": results["exemption"],
    }


def query_single_result(request):
    """
    Run the search with a result size of 1 and expose the first hit.

    Returns ``(1, context)`` where ``context`` is whatever
    ``query_results`` returned. When that is a context dict with at least
    one result, the first result is also stored under ``context["item"]``.

    If ``query_results`` returned ``False``, ``None`` or a validation
    error dict (none of which carry ``results``), the value is passed
    through unchanged instead of raising.
    """
    context = query_results(request, 1)
    results = context.get("results") if isinstance(context, dict) else None
    if results:
        context["item"] = results[0]

    return (1, context)


def construct_query(query, size):
    """
    Build the OpenSearch request body for a free-text search.

    ``query`` is placed in a ``query_string`` clause inside ``bool.must``.
    ``size`` is the number of hits to request and falls back to 5 when
    falsy. Results are sorted by ``ts``, newest first. A fresh dict is
    built on every call.
    """
    query_string_options = {
        "query": query,
        "fuzziness": "AUTO",
        "fuzzy_transpositions": True,
        "fuzzy_max_expansions": 50,
        "fuzzy_prefix_length": 0,
        "default_operator": "or",
        "analyzer": "standard",
        "lenient": True,
        "boost": 1,
        "allow_leading_wildcard": True,
        "phrase_slop": 3,
        "quote_field_suffix": "",
        "quote_analyzer": "standard",
        "analyze_wildcard": False,
        "auto_generate_synonyms_phrase_query": True,
    }
    must_clauses = [{"query_string": query_string_options}]
    newest_first = [{"ts": {"order": "desc"}}]
    return {
        "size": size or 5,
        "query": {"bool": {"must": must_clauses}},
        "sort": newest_first,
    }
Add opensearch library 2022-07-21 12:47:02 +00:00			`from django.conf import settings`
			`from opensearchpy import OpenSearch`
Gracefully handle invalid queries 2022-07-21 12:51:27 +00:00			`from opensearchpy.exceptions import RequestError`
Add opensearch library 2022-07-21 12:47:02 +00:00
Implement Insights page 2022-07-21 12:51:55 +00:00			`from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online`

Add opensearch library 2022-07-21 12:47:02 +00:00
			`def initialise_opensearch():`
Implement Insights page 2022-07-21 12:51:55 +00:00			`"""`
			`Inititialise the OpenSearch API endpoint.`
			`"""`
Add opensearch library 2022-07-21 12:47:02 +00:00			`auth = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)`
			`client = OpenSearch(`
Reformat OpenSearch 2022-07-21 12:47:10 +00:00			`# fmt: off`
Add opensearch library 2022-07-21 12:47:02 +00:00			`hosts=[{"host": settings.OPENSEARCH_URL,`
Reformat OpenSearch 2022-07-21 12:47:10 +00:00			`"port": settings.OPENSEARCH_PORT}],`
Add opensearch library 2022-07-21 12:47:02 +00:00			`http_compress=False, # enables gzip compression for request bodies`
			`http_auth=auth,`
			`# client_cert = client_cert_path,`
			`# client_key = client_key_path,`
			`use_ssl=settings.OPENSEARCH_TLS,`
			`verify_certs=False,`
			`ssl_assert_hostname=False,`
			`ssl_show_warn=False,`
			`# a_certs=ca_certs_path,`
			`)`
			`return client`


Implement Insights page 2022-07-21 12:51:55 +00:00			`client = initialise_opensearch()`


			`def annotate_results(results_parsed):`
			`"""`
			`Accept a list of dict objects, search for the number of channels and users.`
			`Add them to the object.`
			`Mutate it in place. Does not return anything.`
			`"""`
			`# Figure out items with net (not discord)`
			`nets = set()`
			`for x in results_parsed:`
			`if "net" in x:`
			`nets.add(x["net"])`

			`for net in nets:`
			`# Annotate the online attribute from Threshold`
Properly handle networks when looking up users' online status 2022-07-29 16:51:19 +00:00			`nicks = [`
			`x["nick"] for x in results_parsed if x["src"] == "irc" and x["net"] == net`
			`]`
			`channels = [`
			`x["channel"]`
			`for x in results_parsed`
			`if x["src"] == "irc" and x["net"] == net`
			`]`
			`online_info = annotate_online(net, nicks)`
Implement Insights page 2022-07-21 12:51:55 +00:00			`# Annotate the number of users in the channel`
Properly handle networks when looking up users' online status 2022-07-29 16:51:19 +00:00			`num_users = annotate_num_users(net, channels)`
Implement Insights page 2022-07-21 12:51:55 +00:00			`# Annotate the number channels the user is on`
Properly handle networks when looking up users' online status 2022-07-29 16:51:19 +00:00			`num_chans = annotate_num_chans(net, nicks)`
Implement Insights page 2022-07-21 12:51:55 +00:00			`for item in results_parsed:`
Implement adding the next relay 2022-07-29 21:41:53 +00:00			`if "net" in item:`
			`if item["net"] == net:`
			`if "nick" in item:`
			`if item["nick"] in online_info:`
			`item["online"] = online_info[item["nick"]]`
			`if "channel" in item:`
			`if item["channel"] in num_users:`
			`item["num_users"] = num_users[item["channel"]]`
			`if "nick" in item:`
			`if item["nick"] in num_chans:`
			`item["num_chans"] = num_chans[item["nick"]]`
Implement Insights page 2022-07-21 12:51:55 +00:00

			`def filter_blacklisted(user, response):`
			`"""`
			`Low level filter to take the raw OpenSearch response and remove`
			`objects from it we want to keep secret.`
			`Does not return, the object is mutated in place.`
			`"""`
			`response["redacted"] = 0`
			`response["exemption"] = None`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`if user.is_superuser:`
			`response["exemption"] = True`
			`# is_anonymous = isinstance(user, AnonymousUser)`
Implement Insights page 2022-07-21 12:51:55 +00:00			`# For every hit from ES`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`for index, item in enumerate(list(response["hits"]["hits"])):`
Implement Insights page 2022-07-21 12:51:55 +00:00			`# For every blacklisted type`
			`for blacklisted_type in settings.OPENSEARCH_BLACKLISTED.keys():`
			`# Check this field we are matching exists`
			`if blacklisted_type in item["_source"].keys():`
			`content = item["_source"][blacklisted_type]`
			`# For every item in the blacklisted array for the type`
			`for blacklisted_item in settings.OPENSEARCH_BLACKLISTED[`
			`blacklisted_type`
			`]:`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`if blacklisted_item == str(content):`
Implement Insights page 2022-07-21 12:51:55 +00:00			`# Remove the item`
			`if item in response["hits"]["hits"]:`
Make Drilldown public 2022-08-02 21:22:22 +00:00			`# Anonymous`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`if user.is_anonymous:`
Implement Insights page 2022-07-21 12:51:55 +00:00			`response["hits"]["hits"].remove(item)`
Make Drilldown public 2022-08-02 21:22:22 +00:00			`else:`
			`if not user.is_superuser:`
			`response["hits"]["hits"].remove(item)`
Make search public and refine blacklisting 2022-08-02 21:22:22 +00:00			`else:`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`response["hits"]["hits"][index]["_source"][`
			`"exemption"`
			`] = True`

Implement Insights page 2022-07-21 12:51:55 +00:00			`# Let the UI know something was redacted`
			`response["redacted"] += 1`


Implement meta search 2022-07-21 12:52:41 +00:00			`def run_main_query(client, user, query, custom_query=False, index=None, size=None):`
Implement Insights page 2022-07-21 12:51:55 +00:00			`"""`
			`Low level helper to run an ES query.`
			`Accept a user to pass it to the filter, so we can`
			`avoid filtering for superusers.`
			`Accept fields and size, for the fields we want to match and the`
			`number of results to return.`
			`"""`
Implement meta search 2022-07-21 12:52:41 +00:00			`if not index:`
			`index = settings.OPENSEARCH_INDEX_MAIN`
Implement more elements on Insights page 2022-07-21 12:52:10 +00:00			`if custom_query:`
			`search_query = query`
			`else:`
			`search_query = construct_query(query, size)`
Implement Insights page 2022-07-21 12:51:55 +00:00			`try:`
Implement meta search 2022-07-21 12:52:41 +00:00			`response = client.search(body=search_query, index=index)`
Implement choosing source in search form 2022-08-03 22:26:22 +00:00			`except RequestError as err:`
			`print("OpenSearch error", err)`
Implement Insights page 2022-07-21 12:51:55 +00:00			`return False`
			`filter_blacklisted(user, response)`
			`return response`


			`def query_results(request, size=None):`
			`"""`
			`API helper to alter the OpenSearch return format into something`
			`a bit better to parse.`
			`Accept a HTTP request object. Run the query, and annotate the`
			`results with the other data we have.`
			`"""`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`# is_anonymous = isinstance(request.user, AnonymousUser)`
Implement choosing source in search form 2022-08-03 22:26:22 +00:00			`message = None`
			`message_class = None`
			`add_bool = []`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`if request.user.is_anonymous:`
			`sizes = settings.OPENSEARCH_MAIN_SIZES_ANON`
Make search public and refine blacklisting 2022-08-02 21:22:22 +00:00			`else:`
			`sizes = settings.OPENSEARCH_MAIN_SIZES`
Implement Insights page 2022-07-21 12:51:55 +00:00			`if not size:`
			`if "size" in request.POST:`
			`size = request.POST["size"]`
Make search public and refine blacklisting 2022-08-02 21:22:22 +00:00			`if size not in sizes:`
Implement choosing source in search form 2022-08-03 22:26:22 +00:00			`message = "Size is not permitted"`
			`message_class = "danger"`
			`return {"message": message, "class": message_class}`
			`if "source" in request.POST:`
			`source = request.POST["source"]`
			`if source not in settings.OPENSEARCH_MAIN_SOURCES:`
			`message = "Invalid source"`
			`message_class = "danger"`
			`return {"message": message, "class": message_class}`
			`if source != "all":`
			`add_bool.append({"src": source})`

			`if "check-sentiment" in request.POST:`
			`if "sentiment" in request.POST:`
			`sentiment = request.POST["sentiment"]`
			`try:`
			`sentiment = float(sentiment)`
			`except ValueError:`
			`message = "Sentiment is not a float"`
			`message_class = "danger"`
			`return {"message": message, "class": message_class}`

Implement Insights page 2022-07-21 12:51:55 +00:00			`if "query" in request.POST:`
			`query = request.POST["query"]`
Implement choosing source in search form 2022-08-03 22:26:22 +00:00			`search_query = construct_query(query, size)`
			`if add_bool:`
			`for item in add_bool:`
			`search_query["query"]["bool"]["must"].append({"match": item})`
Implement Insights page 2022-07-21 12:51:55 +00:00			`results = run_main_query(`
			`client,`
Improve redaction and anonymous user handling 2022-08-03 20:56:27 +00:00			`request.user, # passed through run_main_query to filter_blacklisted`
Implement choosing source in search form 2022-08-03 22:26:22 +00:00			`search_query,`
			`custom_query=True,`
Implement more elements on Insights page 2022-07-21 12:52:10 +00:00			`size=size,`
Implement Insights page 2022-07-21 12:51:55 +00:00			`)`
			`if not results:`
			`return False`
			`results_parsed = []`
			`if "hits" in results.keys():`
			`if "hits" in results["hits"]:`
			`for item in results["hits"]["hits"]:`
			`element = item["_source"]`
			`element["id"] = item["_id"]`

			`# Split the timestamp into date and time`
			`ts = element["ts"]`
			`ts_spl = ts.split("T")`
			`date = ts_spl[0]`
			`time = ts_spl[1]`
			`element["date"] = date`
			`element["time"] = time`
			`results_parsed.append(element)`

			`annotate_results(results_parsed)`

			`context = {`
			`"query": query,`
			`"results": results_parsed,`
			`"card": results["hits"]["total"]["value"],`
			`"took": results["took"],`
			`"redacted": results["redacted"],`
			`"exemption": results["exemption"],`
			`}`
			`return context`


			`def query_single_result(request):`
			`context = query_results(request, 1)`
			`dedup_set = {item["nick"] for item in context["results"]}`
Run meta on results from nicktrace in Insights 2022-07-21 12:52:48 +00:00			`if dedup_set:`
Implement Insights page 2022-07-21 12:51:55 +00:00			`context["item"] = context["results"][0]`
Run meta on results from nicktrace in Insights 2022-07-21 12:52:48 +00:00
Implement Insights page 2022-07-21 12:51:55 +00:00			`return (1, context)`


			`def construct_query(query, size):`
			`"""`
			`Accept some query parameters and construct an OpenSearch query.`
			`"""`
Improve context passing and implement superuser override for redactions 2022-07-21 12:49:32 +00:00			`if not size:`
			`size = 5`
Add opensearch library 2022-07-21 12:47:02 +00:00			`query = {`
Improve context passing and implement superuser override for redactions 2022-07-21 12:49:32 +00:00			`"size": size,`
Add opensearch library 2022-07-21 12:47:02 +00:00			`"query": {`
Implement choosing source in search form 2022-08-03 22:26:22 +00:00			`"bool": {`
			`"must": [`
			`{`
			`"query_string": {`
			`"query": query,`
			`# "fields": fields,`
			`# "default_field": "msg",`
			`# "type": "best_fields",`
			`"fuzziness": "AUTO",`
			`"fuzzy_transpositions": True,`
			`"fuzzy_max_expansions": 50,`
			`"fuzzy_prefix_length": 0,`
			`# "minimum_should_match": 1,`
			`"default_operator": "or",`
			`"analyzer": "standard",`
			`"lenient": True,`
			`"boost": 1,`
			`"allow_leading_wildcard": True,`
			`# "enable_position_increments": False,`
			`"phrase_slop": 3,`
			`# "max_determinized_states": 10000,`
			`"quote_field_suffix": "",`
			`"quote_analyzer": "standard",`
			`"analyze_wildcard": False,`
			`"auto_generate_synonyms_phrase_query": True,`
			`}`
			`}`
			`]`
Add opensearch library 2022-07-21 12:47:02 +00:00			`}`
			`},`
Sort results by date 2022-07-21 12:51:38 +00:00			`"sort": [`
			`{`
			`"ts": {`
			`"order": "desc",`
			`}`
			`}`
			`],`
Add opensearch library 2022-07-21 12:47:02 +00:00			`}`
			`return query`