From 87324de66605ead283ee96e240c031df91979435 Mon Sep 17 00:00:00 2001
From: Mark Veidemanis
Date: Tue, 6 Sep 2022 11:53:32 +0100
Subject: [PATCH] Fix some Manticore queries
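
Manticore's JSON search API is close to OpenSearch but not identical, so
the query builders and result handling need a few adjustments:

- core/lib/context.py: use "equals" rather than "match" for the numeric
  "num" and "mtype" fields, use "match_phrase" for the network name, pass
  the target index and result limit in the request body ("index" and
  "limit" instead of "size"), and only attach the optional "should"
  clauses when they are non-empty.
- core/lib/manticore.py: fall back to a supplied custom query, convert the
  from/to date and time parameters to UNIX timestamps before building the
  "ts" range clause, and import the shared result helpers from the new
  processing module.
- core/lib/processing.py: new module holding annotate_results,
  filter_blacklisted and parse_results, moved out of core/lib/opensearch.py
  so both backends can share them; parse_results now also drops fields
  with empty string values.
- core/templates/ui/drilldown/search_partial.html: simplify the search box
  placeholder and remove unused markup.

For illustration only (the index name and field values below are made up),
a query built by core/lib/context.py now looks roughly like:

    {
        "index": "main",
        "limit": 15,
        "query": {
            "bool": {
                "must": [
                    {"equals": {"num": 1}},
                    {"match_phrase": {"net": "libera"}},
                    {"match": {"channel": "#python"}},
                ]
            }
        },
        "fields": ["ts", "nick", "channel", "msg"],
    }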
---
core/lib/context.py | 37 ++--
core/lib/manticore.py | 31 +++-
core/lib/opensearch.py | 161 +----------------
core/lib/processing.py | 164 ++++++++++++++++++
.../ui/drilldown/search_partial.html | 41 +----
core/views/ui/drilldown.py | 6 +-
6 files changed, 215 insertions(+), 225 deletions(-)
create mode 100644 core/lib/processing.py
diff --git a/core/lib/context.py b/core/lib/context.py
index 40b560d..ccff818 100644
--- a/core/lib/context.py
+++ b/core/lib/context.py
@@ -4,9 +4,9 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
extra_should = []
extra_should2 = []
if num:
- extra_must.append({"match": {"num": num}})
+ extra_must.append({"equals": {"num": num}})
if net:
- extra_must.append({"match": {"net": net}})
+ extra_must.append({"match_phrase": {"net": net}})
if channel:
extra_must.append({"match": {"channel": channel}})
if nicks:
@@ -52,31 +52,36 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
extra_should.append({"match": {"nick": channel}})
else:
for ctype in types:
- extra_should.append({"match": {"mtype": ctype}})
+ extra_should.append({"equals": {"mtype": ctype}})
else:
for ctype in types:
extra_should.append({"match": {"type": ctype}})
query = {
- "size": size,
+ "index": index,
+ "limit": size,
"query": {
"bool": {
"must": [
- {"match": {"src": src}},
- {
- "bool": {
- "should": [*extra_should],
- }
- },
- {
- "bool": {
- "should": [*extra_should2],
- }
- },
+ # {"equals": {"src": src}},
+ # {
+ # "bool": {
+ # "should": [*extra_should],
+ # }
+ # },
+ # {
+ # "bool": {
+ # "should": [*extra_should2],
+ # }
+ # },
*extra_must,
]
}
},
"fields": fields,
- "_source": False,
+ # "_source": False,
}
+ if extra_should:
+ query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should]}})
+ if extra_should2:
+ query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should2]}})
return query
diff --git a/core/lib/manticore.py b/core/lib/manticore.py
index 6d2a13a..9d56104 100644
--- a/core/lib/manticore.py
+++ b/core/lib/manticore.py
@@ -1,10 +1,13 @@
-from re import search
-from django.conf import settings
-from core.lib.opensearch import annotate_results, filter_blacklisted, parse_results
-import manticoresearch
-from core.views.helpers import dedup_list
+from datetime import datetime
from pprint import pprint
+import manticoresearch
+from django.conf import settings
+
+from core.lib.processing import annotate_results, filter_blacklisted, parse_results
+from core.views.helpers import dedup_list
+
+
def initialise_manticore():
"""
Initialise the Manticore client
@@ -15,8 +18,10 @@ def initialise_manticore():
return (api_client, api_instance)
+
api_client, client = initialise_manticore()
+
def construct_query(query, size, index, blank=False):
"""
Accept some query parameters and construct an OpenSearch query.
@@ -35,12 +40,14 @@ def construct_query(query, size, index, blank=False):
query_base["query"]["bool"]["must"].append(query_string)
return query_base
+
def run_query(client, user, search_query):
response = client.search(search_query)
response = response.to_dict()
filter_blacklisted(user, response)
return response
+
def query_results(
request,
query_params,
@@ -110,6 +117,9 @@ def query_results(
query = query_params["query"]
search_query = construct_query(query, size, index)
query_created = True
+ else:
+ if custom_query:
+ search_query = custom_query
if tags:
# Get a blank search query
@@ -159,13 +169,16 @@ def query_results(
add_top.append(add_top_tmp)
print("AFTER", add_top)
-
# Date/time range
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys()
):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
+ from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
+ to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
+ from_ts = int(from_ts.timestamp())
+ to_ts = int(to_ts.timestamp())
range_query = {
"range": {
"ts": {
@@ -247,7 +260,6 @@ def query_results(
if sort:
search_query["sort"] = sort
-
pprint(search_query)
results = run_query(
client,
@@ -256,7 +268,7 @@ def query_results(
)
if not results:
return False
- #results = results.to_dict()
+ # results = results.to_dict()
results_parsed = parse_results(results)
if annotate:
annotate_results(results_parsed)
@@ -280,4 +292,5 @@ def query_results(
"card": results["hits"]["total"],
"took": results["took"],
}
- return context
\ No newline at end of file
+ print("RTRN", context)
+ return context
diff --git a/core/lib/opensearch.py b/core/lib/opensearch.py
index 6a1c5a4..032cb1a 100644
--- a/core/lib/opensearch.py
+++ b/core/lib/opensearch.py
@@ -5,11 +5,10 @@ from django.conf import settings
from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError
-from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
-from core.views.helpers import dedup_list
-from datetime import datetime
# from json import dumps
# pp = lambda x: print(dumps(x, indent=2))
+from core.lib.processing import annotate_results, filter_blacklisted, parse_results
+from core.views.helpers import dedup_list
def initialise_opensearch():
@@ -37,114 +36,6 @@ def initialise_opensearch():
client = initialise_opensearch()
-def annotate_results(results_parsed):
- """
- Accept a list of dict objects, search for the number of channels and users.
- Add them to the object.
- Mutate it in place. Does not return anything.
- """
- # Figure out items with net (not discord)
- nets = set()
- for x in results_parsed:
- if "net" in x:
- nets.add(x["net"])
-
- for net in nets:
- # Annotate the online attribute from Threshold
- nicks = list(
- set(
- [
- x["nick"]
- for x in results_parsed
- if {"nick", "src", "net"}.issubset(x)
- and x["src"] == "irc"
- and x["net"] == net
- ]
- )
- )
- channels = list(
- set(
- [
- x["channel"]
- for x in results_parsed
- if {"channel", "src", "net"}.issubset(x)
- and x["src"] == "irc"
- and x["net"] == net
- ]
- )
- )
- online_info = annotate_online(net, nicks)
- # Annotate the number of users in the channel
- num_users = annotate_num_users(net, channels)
- # Annotate the number channels the user is on
- num_chans = annotate_num_chans(net, nicks)
- for item in results_parsed:
- if "net" in item:
- if item["net"] == net:
- if "nick" in item:
- if item["nick"] in online_info:
- item["online"] = online_info[item["nick"]]
- if "channel" in item:
- if item["channel"] in num_users:
- item["num_users"] = num_users[item["channel"]]
- if "nick" in item:
- if item["nick"] in num_chans:
- item["num_chans"] = num_chans[item["nick"]]
-
-
-def filter_blacklisted(user, response):
- """
- Low level filter to take the raw OpenSearch response and remove
- objects from it we want to keep secret.
- Does not return, the object is mutated in place.
- """
- response["redacted"] = 0
- response["exemption"] = None
- if user.is_superuser:
- response["exemption"] = True
- # is_anonymous = isinstance(user, AnonymousUser)
- # For every hit from ES
- for index, item in enumerate(list(response["hits"]["hits"])):
- # For every blacklisted type
- for blacklisted_type in settings.OPENSEARCH_BLACKLISTED.keys():
- # Check this field we are matching exists
- if "_source" in item.keys():
- data_index = "_source"
- elif "fields" in item.keys():
- data_index = "fields"
- else:
- return False
- if blacklisted_type in item[data_index].keys():
- content = item[data_index][blacklisted_type]
- # For every item in the blacklisted array for the type
- for blacklisted_item in settings.OPENSEARCH_BLACKLISTED[
- blacklisted_type
- ]:
- if blacklisted_item == str(content):
- # Remove the item
- if item in response["hits"]["hits"]:
- # Let the UI know something was redacted
- if (
- "exemption"
- not in response["hits"]["hits"][index][data_index]
- ):
- response["redacted"] += 1
- # Anonymous
- if user.is_anonymous:
- # Just set it to none so the index is not off
- response["hits"]["hits"][index] = None
- else:
- if not user.has_perm("core.bypass_blacklist"):
- response["hits"]["hits"][index] = None
- else:
- response["hits"]["hits"][index][data_index][
- "exemption"
- ] = True
-
- # Actually get rid of all the things we set to None
- response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]
-
-
def construct_query(query, size, use_query_string=True, tokens=False):
"""
Accept some query parameters and construct an OpenSearch query.
@@ -233,54 +124,6 @@ def run_main_query(client, user, query, custom_query=False, index=None, size=Non
return response
-def parse_results(results):
- results_parsed = []
- stringify = ["host", "channel"]
- if "hits" in results.keys():
- if "hits" in results["hits"]:
- for item in results["hits"]["hits"]:
- if "_source" in item.keys():
- data_index = "_source"
- elif "fields" in item.keys():
- data_index = "fields"
- else:
- return False
- element = item[data_index]
- for field in stringify:
- if field in element:
- element[field] = str(element[field])
- # Why are fields in lists...
- if data_index == "fields":
- element = {k: v[0] for k, v in element.items() if len(v)}
- element["id"] = item["_id"]
-
- # Split the timestamp into date and time
- if "ts" not in element:
- if "time" in element: # will fix data later
- ts = element["time"]
- del element["time"]
- element["ts"] = ts
- if "ts" in element:
- if isinstance(element["ts"], str):
- ts = element["ts"]
- else:
- ts = datetime.utcfromtimestamp(element["ts"]).strftime('%Y-%m-%dT%H:%M:%S')
- ts_spl = ts.split("T")
- date = ts_spl[0]
- time = ts_spl[1]
- element["date"] = date
- if "." in time:
- time_spl = time.split(".")
- if len(time_spl) == 2:
- element["time"] = time.split(".")[0]
- else:
- element["time"] = time
- else:
- element["time"] = time
- results_parsed.append(element)
- return results_parsed
-
-
def query_results(
request,
query_params,
diff --git a/core/lib/processing.py b/core/lib/processing.py
new file mode 100644
index 0000000..d3607dc
--- /dev/null
+++ b/core/lib/processing.py
@@ -0,0 +1,164 @@
+from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
+from django.conf import settings
+from datetime import datetime
+
+
+def annotate_results(results_parsed):
+ """
+ Accept a list of dict objects, search for the number of channels and users.
+ Add them to the object.
+ Mutate it in place. Does not return anything.
+ """
+ # Figure out items with net (not discord)
+ nets = set()
+ for x in results_parsed:
+ if "net" in x:
+ nets.add(x["net"])
+
+ for net in nets:
+ # Annotate the online attribute from Threshold
+ nicks = list(
+ set(
+ [
+ x["nick"]
+ for x in results_parsed
+ if {"nick", "src", "net"}.issubset(x)
+ and x["src"] == "irc"
+ and x["net"] == net
+ ]
+ )
+ )
+ channels = list(
+ set(
+ [
+ x["channel"]
+ for x in results_parsed
+ if {"channel", "src", "net"}.issubset(x)
+ and x["src"] == "irc"
+ and x["net"] == net
+ ]
+ )
+ )
+ online_info = annotate_online(net, nicks)
+ # Annotate the number of users in the channel
+ num_users = annotate_num_users(net, channels)
+ # Annotate the number of channels the user is on
+ num_chans = annotate_num_chans(net, nicks)
+ for item in results_parsed:
+ if "net" in item:
+ if item["net"] == net:
+ if "nick" in item:
+ if item["nick"] in online_info:
+ item["online"] = online_info[item["nick"]]
+ if "channel" in item:
+ if item["channel"] in num_users:
+ item["num_users"] = num_users[item["channel"]]
+ if "nick" in item:
+ if item["nick"] in num_chans:
+ item["num_chans"] = num_chans[item["nick"]]
+
+
+def filter_blacklisted(user, response):
+ """
+ Low level filter to take the raw OpenSearch response and remove
+ objects from it we want to keep secret.
+ Does not return, the object is mutated in place.
+ """
+ response["redacted"] = 0
+ response["exemption"] = None
+ if user.is_superuser:
+ response["exemption"] = True
+ # is_anonymous = isinstance(user, AnonymousUser)
+ # For every hit from ES
+ for index, item in enumerate(list(response["hits"]["hits"])):
+ # For every blacklisted type
+ for blacklisted_type in settings.OPENSEARCH_BLACKLISTED.keys():
+ # Check this field we are matching exists
+ if "_source" in item.keys():
+ data_index = "_source"
+ elif "fields" in item.keys():
+ data_index = "fields"
+ else:
+ return False
+ if blacklisted_type in item[data_index].keys():
+ content = item[data_index][blacklisted_type]
+ # For every item in the blacklisted array for the type
+ for blacklisted_item in settings.OPENSEARCH_BLACKLISTED[
+ blacklisted_type
+ ]:
+ if blacklisted_item == str(content):
+ # Remove the item
+ if item in response["hits"]["hits"]:
+ # Let the UI know something was redacted
+ if (
+ "exemption"
+ not in response["hits"]["hits"][index][data_index]
+ ):
+ response["redacted"] += 1
+ # Anonymous
+ if user.is_anonymous:
+ # Just set it to none so the index is not off
+ response["hits"]["hits"][index] = None
+ else:
+ if not user.has_perm("core.bypass_blacklist"):
+ response["hits"]["hits"][index] = None
+ else:
+ response["hits"]["hits"][index][data_index][
+ "exemption"
+ ] = True
+
+ # Actually get rid of all the things we set to None
+ response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]
+
+
+def parse_results(results):
+ results_parsed = []
+ stringify = ["host", "channel"]
+ if "hits" in results.keys():
+ if "hits" in results["hits"]:
+ for item in results["hits"]["hits"]:
+ if "_source" in item.keys():
+ data_index = "_source"
+ elif "fields" in item.keys():
+ data_index = "fields"
+ else:
+ return False
+ element = item[data_index]
+ for field in stringify:
+ if field in element:
+ element[field] = str(element[field])
+ # Why are fields in lists...
+ if data_index == "fields":
+ element = {k: v[0] for k, v in element.items() if len(v)}
+ element["id"] = item["_id"]
+
+ # Remove empty values
+ for field in list(element.keys()):
+ if element[field] == "":
+ del element[field]
+
+ # Split the timestamp into date and time
+ if "ts" not in element:
+ if "time" in element: # will fix data later
+ ts = element["time"]
+ del element["time"]
+ element["ts"] = ts
+ if "ts" in element:
+ if isinstance(element["ts"], str):
+ ts = element["ts"]
+ else:
+ ts = datetime.utcfromtimestamp(element["ts"]).strftime('%Y-%m-%dT%H:%M:%S')
+ ts_spl = ts.split("T")
+ date = ts_spl[0]
+ time = ts_spl[1]
+ element["date"] = date
+ if "." in time:
+ time_spl = time.split(".")
+ if len(time_spl) == 2:
+ element["time"] = time.split(".")[0]
+ else:
+ element["time"] = time
+ else:
+ element["time"] = time
+ results_parsed.append(element)
+ return results_parsed
\ No newline at end of file
diff --git a/core/templates/ui/drilldown/search_partial.html b/core/templates/ui/drilldown/search_partial.html
index e67bec5..a6da2fd 100644
--- a/core/templates/ui/drilldown/search_partial.html
+++ b/core/templates/ui/drilldown/search_partial.html
@@ -17,7 +17,7 @@
value="{{ params.query }}"
class="input"
type="text"
- placeholder="Token search: (science | tech | art) + (interest) -hello">
+ placeholder="Search something">
@@ -76,28 +76,6 @@
-
-
-
-
- {% if False %}
- {# what are you looking at? #}
- No access
- {% endif %}
-