From f0f7ceb8d14eedaf465858c4df9a1afa566682b6 Mon Sep 17 00:00:00 2001
From: Mark Veidemanis
Date: Thu, 21 Jul 2022 13:52:34 +0100
Subject: [PATCH] Implement more efficient and accurate search algorithm

---
 app/local_settings.example.py  |   5 ++
 core/lib/nicktrace.py          | 113 +++++++++++++++++++--------------
 core/views/dynamic/insights.py |   4 +-
 3 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/app/local_settings.example.py b/app/local_settings.example.py
index 7e69036..38237a6 100644
--- a/app/local_settings.example.py
+++ b/app/local_settings.example.py
@@ -53,4 +53,9 @@ THRESHOLD_API_KEY = "name"
 THRESHOLD_API_TOKEN = "token"
 THRESHOLD_API_COUNTER = "counter"
 
+# NickTrace
+NICKTRACE_MAX_ITERATIONS = 4
+NICKTRACE_MAX_CHUNK_SIZE = 500
+NICKTRACE_QUERY_SIZE = 10000
+
 DEBUG = True
diff --git a/core/lib/nicktrace.py b/core/lib/nicktrace.py
index 173490e..af7e801 100644
--- a/core/lib/nicktrace.py
+++ b/core/lib/nicktrace.py
@@ -1,14 +1,22 @@
+from math import ceil
+
+from django.conf import settings
+from numpy import array_split
+
 from core.lib.opensearch import client, run_main_query
 
 
-def get_nicks(request, net, nick, iter=0):
-    """
-    Get all related nicknames of the given nickname by tracking nickname changes.
-    """
-    print("GET NICKS INIT", net, nick, iter)
+def construct_query(net, nicks):
+    """
+    Build the search query for "nick" type events in which any of the given
+    names matches the nick or the user field.
+    """
+    query_nicks = [{"match": {"nick": x}} for x in nicks]
+    query_users = [{"match": {"user": x}} for x in nicks]
+    query_should = query_nicks + query_users
     # Get the initial query
     query = {
-        "size": 10000,
+        "size": settings.NICKTRACE_QUERY_SIZE,
         "query": {
             "bool": {
                 "must": [
@@ -16,55 +24,64 @@ def get_nicks(request, net, nick, iter=0):
                     {"match": {"type": "nick"}},
                     {
                         "bool": {
-                            "should": [
-                                {"match": {"nick": nick}},
-                                {"match": {"user": nick}},
-                            ]
+                            "should": query_should,
                         }
                     },
                 ]
             }
         },
     }
-    results = run_main_query(client, request.user, query, custom_query=True)
+    return query
+
+
+def get_nicks(request, net, nicks, iter=True):
+    """
+    Get all related nicknames of the given nicknames by tracking nickname changes.
+
+    The given names are searched in chunks of at most
+    settings.NICKTRACE_MAX_CHUNK_SIZE. When iter is true, names found that have
+    not been searched yet are searched in turn, for up to
+    settings.NICKTRACE_MAX_ITERATIONS rounds or until nothing new turns up.
+    Returns the list of unique nick and user values from the matching events.
+    """
+    # array_split raises ValueError when asked for zero sections
+    if len(nicks) == 0:
+        return []
+    # The names passed in are covered by the chunked search below
+    nicks_searched = set(nicks)
+
+    # Split query into chunks
+    split_nicks = array_split(
+        nicks, ceil(len(nicks) / settings.NICKTRACE_MAX_CHUNK_SIZE)
+    )
     nicks = []
-    if "hits" in results.keys():
-        if "hits" in results["hits"]:
-            for item in results["hits"]["hits"]:
-                element = item["_source"]
-                element["id"] = item["_id"]
+    for nicks_chunked in split_nicks:
+        if len(nicks_chunked) == 0:
+            continue
+        query = construct_query(net, nicks_chunked)
+        results = run_main_query(client, request.user, query, custom_query=True)
+        if "hits" not in results or "hits" not in results["hits"]:
+            continue
+        for item in results["hits"]["hits"]:
+            element = item["_source"]
+            for name in (element["nick"], element["user"]):
+                if name not in nicks:
+                    nicks.append(name)
 
-                # Split the timestamp into date and time
-                ts = element["ts"]
-                ts_spl = ts.split("T")
-                date = ts_spl[0]
-                time = ts_spl[1]
-                element["date"] = date
-                element["time"] = time
-                if element["nick"] not in nicks:
-                    nicks.append(element["nick"])
-                if element["user"] not in nicks:
-                    nicks.append(element["user"])
+    # Run the search again, passing in only the names not searched so far
+    if iter:
+        for _ in range(settings.NICKTRACE_MAX_ITERATIONS):
+            nicks_not_searched = [x for x in nicks if x not in nicks_searched]
+            if not nicks_not_searched:
+                break
+            nicks_l2 = get_nicks(request, net, nicks_not_searched, False)
+            nicks_searched.update(nicks_not_searched)
+
+            # If all of the nicks we received now, we already know about
+            if set(nicks_l2).issubset(set(nicks)):
+                break
+            for x in nicks_l2:
+                if x not in nicks:
+                    nicks.append(x)
 
-    # if iter < 2:
-    #     iter += 1
-    #     collect_nicks = []
-    #     for x in nicks:
-    #         nicks_2 = get_nicks(request, net, x, iter)
-    #         print("NICKS_2", nicks_2)
-    #         for y in nicks_2:
-    #             if y not in collect_nicks:
-    #                 collect_nicks.append(y)
-    #     print("RETURN NICKS", nick, collect_nicks)
-    #     for x in collect_nicks:
-    #         if x not in nicks:
-    #             nicks.append(x)
-    # else:
-    #     print("ABORTING SEARCH")
     return nicks
-    # results = set()
-    # nicks = query["nicks"]
-    # for nick in nicks:
-    #     if nick not in results:
-    #         nicks_result = get_nicks(request, net_nick)
-    #         results.add(nick)
diff --git a/core/views/dynamic/insights.py b/core/views/dynamic/insights.py
index e41c4af..950d431 100644
--- a/core/views/dynamic/insights.py
+++ b/core/views/dynamic/insights.py
@@ -66,7 +66,9 @@ class InsightsNicks(LoginRequiredMixin, APIView):
             return HttpResponse("No nick")
         net = request.data["net"]
         nick = request.data["nick"]
-        nicks = get_nicks(request, net, nick)
+        nicks = get_nicks(request, net, [nick])
+        # Filter Guest
+        nicks = [x for x in nicks if not x.startswith("Guest")]
         online = annotate_online(net, nicks)
         if not nicks:
             return HttpResponseForbidden()