Fix some Manticore queries

This commit is contained in:
Mark Veidemanis 2022-09-06 11:53:32 +01:00
parent 3b8735be72
commit 87324de666
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
6 changed files with 215 additions and 225 deletions

View File

@ -4,9 +4,9 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
extra_should = []
extra_should2 = []
if num:
extra_must.append({"match": {"num": num}})
extra_must.append({"equals": {"num": num}})
if net:
extra_must.append({"match": {"net": net}})
extra_must.append({"match_phrase": {"net": net}})
if channel:
extra_must.append({"match": {"channel": channel}})
if nicks:
@ -52,31 +52,36 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
extra_should.append({"match": {"nick": channel}})
else:
for ctype in types:
extra_should.append({"match": {"mtype": ctype}})
extra_should.append({"equals": {"mtype": ctype}})
else:
for ctype in types:
extra_should.append({"match": {"type": ctype}})
query = {
"size": size,
"index": index,
"limit": size,
"query": {
"bool": {
"must": [
{"match": {"src": src}},
{
"bool": {
"should": [*extra_should],
}
},
{
"bool": {
"should": [*extra_should2],
}
},
# {"equals": {"src": src}},
# {
# "bool": {
# "should": [*extra_should],
# }
# },
# {
# "bool": {
# "should": [*extra_should2],
# }
# },
*extra_must,
]
}
},
"fields": fields,
"_source": False,
# "_source": False,
}
if extra_should:
query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should]}})
if extra_should2:
query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should2]}})
return query

View File

@ -1,10 +1,13 @@
from re import search
from django.conf import settings
from core.lib.opensearch import annotate_results, filter_blacklisted, parse_results
import manticoresearch
from core.views.helpers import dedup_list
from datetime import datetime
from pprint import pprint
import manticoresearch
from django.conf import settings
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
from core.views.helpers import dedup_list
def initialise_manticore():
"""
Initialise the Manticore client
@ -15,8 +18,10 @@ def initialise_manticore():
return (api_client, api_instance)
api_client, client = initialise_manticore()
def construct_query(query, size, index, blank=False):
"""
Accept some query parameters and construct an OpenSearch query.
@ -35,12 +40,14 @@ def construct_query(query, size, index, blank=False):
query_base["query"]["bool"]["must"].append(query_string)
return query_base
def run_query(client, user, search_query):
    """
    Execute ``search_query`` and return the blacklist-filtered response.

    The raw result of ``client.search`` is converted with ``to_dict()`` and
    then handed to ``filter_blacklisted`` together with ``user``, which
    modifies it in place before it is returned.
    """
    raw_response = client.search(search_query)
    as_dict = raw_response.to_dict()
    # Redaction happens in place; the helper's return value is not used
    filter_blacklisted(user, as_dict)
    return as_dict
def query_results(
request,
query_params,
@ -110,6 +117,9 @@ def query_results(
query = query_params["query"]
search_query = construct_query(query, size, index)
query_created = True
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
@ -159,13 +169,16 @@ def query_results(
add_top.append(add_top_tmp)
print("AFTER", add_top)
# Date/time range
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys()
):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
from_ts = int(from_ts.timestamp())
to_ts = int(to_ts.timestamp())
range_query = {
"range": {
"ts": {
@ -247,7 +260,6 @@ def query_results(
if sort:
search_query["sort"] = sort
pprint(search_query)
results = run_query(
client,
@ -256,7 +268,7 @@ def query_results(
)
if not results:
return False
#results = results.to_dict()
# results = results.to_dict()
results_parsed = parse_results(results)
if annotate:
annotate_results(results_parsed)
@ -280,4 +292,5 @@ def query_results(
"card": results["hits"]["total"],
"took": results["took"],
}
return context
print("RTRN", context)
return context

View File

@ -5,11 +5,10 @@ from django.conf import settings
from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
from core.views.helpers import dedup_list
from datetime import datetime
# from json import dumps
# pp = lambda x: print(dumps(x, indent=2))
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
from core.views.helpers import dedup_list
def initialise_opensearch():
@ -37,114 +36,6 @@ def initialise_opensearch():
client = initialise_opensearch()
def annotate_results(results_parsed):
    """
    Decorate parsed results with presence and membership figures.

    For every distinct ``net`` value found in ``results_parsed``, the nicks
    and channels of the IRC-sourced entries on that network are collected and
    passed to ``annotate_online``, ``annotate_num_users`` and
    ``annotate_num_chans``. Whatever those lookups hold for an entry's nick or
    channel is copied onto the entry as ``online``, ``num_users`` and
    ``num_chans``.

    The entries are modified in place; nothing is returned.
    """
    # Entries without a "net" key (e.g. Discord) are never annotated
    networks = {entry["net"] for entry in results_parsed if "net" in entry}

    for net in networks:

        def irc_values(key):
            """Unique values of ``key`` among the IRC entries on this net."""
            required = {key, "src", "net"}
            return list(
                {
                    entry[key]
                    for entry in results_parsed
                    if required.issubset(entry)
                    and entry["src"] == "irc"
                    and entry["net"] == net
                }
            )

        nicks = irc_values("nick")
        channels = irc_values("channel")

        # Online status of each nick, from Threshold
        online_info = annotate_online(net, nicks)
        # Number of users in each channel
        num_users = annotate_num_users(net, channels)
        # Number of channels each nick is on
        num_chans = annotate_num_chans(net, nicks)

        for item in results_parsed:
            if "net" not in item or item["net"] != net:
                continue
            has_nick = "nick" in item
            if has_nick and item["nick"] in online_info:
                item["online"] = online_info[item["nick"]]
            if "channel" in item and item["channel"] in num_users:
                item["num_users"] = num_users[item["channel"]]
            if has_nick and item["nick"] in num_chans:
                item["num_chans"] = num_chans[item["nick"]]
def filter_blacklisted(user, response):
    """
    Redact blacklisted hits from a raw search response, in place.

    ``settings.OPENSEARCH_BLACKLISTED`` maps a field name to the values of
    that field which must be kept secret. A hit whose field value (compared
    as ``str``) equals one of those values is:

    * removed, when the user is anonymous or lacks the
      ``core.bypass_blacklist`` permission;
    * kept and flagged with ``exemption = True`` inside its data otherwise.

    ``response["redacted"]`` counts the hits that matched (each hit once) and
    ``response["exemption"]`` is ``True`` for superusers, ``None`` otherwise.

    :param user: object exposing ``is_superuser``, ``is_anonymous`` and
        ``has_perm`` (presumably a Django user - confirm against callers)
    :param response: response dict containing ``response["hits"]["hits"]``
    :return: ``False`` if a hit carried neither ``_source`` nor ``fields``
        while a blacklist is configured, otherwise ``None``. Unlike before,
        such a hit no longer aborts the filtering of the remaining hits and
        no ``None`` placeholders are left behind in the hit list.
    """
    response["redacted"] = 0
    response["exemption"] = None
    if user.is_superuser:
        response["exemption"] = True

    blacklist = settings.OPENSEARCH_BLACKLISTED
    hits = response["hits"]["hits"]
    malformed = False

    for index, item in enumerate(hits):
        # Locate the hit's data once, not once per blacklisted type
        if "_source" in item:
            data_index = "_source"
        elif "fields" in item:
            data_index = "fields"
        else:
            # Nothing to inspect on this hit; report it but keep filtering
            if blacklist:
                malformed = True
            continue
        data = item[data_index]

        for blacklisted_type, blacklisted_values in blacklist.items():
            if blacklisted_type not in data:
                continue
            content = str(data[blacklisted_type])
            if not any(entry == content for entry in blacklisted_values):
                continue
            # Let the UI know something was redacted, once per hit
            if "exemption" not in data:
                response["redacted"] += 1
            if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
                # Blank the slot rather than delete it so indexes stay valid
                hits[index] = None
                break
            data["exemption"] = True

    # Actually get rid of all the things we set to None
    response["hits"]["hits"] = [hit for hit in hits if hit]
    if malformed:
        return False
    return None
def construct_query(query, size, use_query_string=True, tokens=False):
"""
Accept some query parameters and construct an OpenSearch query.
@ -233,54 +124,6 @@ def run_main_query(client, user, query, custom_query=False, index=None, size=Non
return response
def parse_results(results):
    """
    Flatten a raw search response into a list of result dicts.

    Each hit's data is read from ``_source`` or, failing that, ``fields``.
    ``fields`` values wrapped in lists are unwrapped to their first item
    (empty lists are dropped), ``host`` and ``channel`` are converted to
    ``str``, the hit's ``_id`` is stored as ``id`` and the timestamp is split
    into ``date`` and ``time`` keys.

    :param results: response dict; hits are read from
        ``results["hits"]["hits"]``
    :return: list of result dicts (empty if the response has no hits), or
        ``False`` if a hit has neither ``_source`` nor ``fields``
    """
    results_parsed = []
    stringify = ["host", "channel"]
    if "hits" not in results or "hits" not in results["hits"]:
        return results_parsed

    for item in results["hits"]["hits"]:
        if "_source" in item:
            data_index = "_source"
        elif "fields" in item:
            data_index = "fields"
        else:
            return False
        element = item[data_index]

        # Why are fields in lists...
        # Unwrap BEFORE stringifying: str(["#chan"])[0] would be "[".
        # Non-list values (e.g. a boolean "exemption" flag) pass through.
        if data_index == "fields":
            unwrapped = {}
            for key, value in element.items():
                if isinstance(value, (list, tuple)):
                    if not value:
                        continue
                    value = value[0]
                unwrapped[key] = value
            element = unwrapped

        for field in stringify:
            if field in element:
                element[field] = str(element[field])
        element["id"] = item["_id"]

        # Older records carry the timestamp under "time" (will fix data later)
        if "ts" not in element and "time" in element:
            element["ts"] = element.pop("time")

        # Split the timestamp into date and time
        if "ts" in element:
            ts = element["ts"]
            if not isinstance(ts, str):
                # Numeric timestamps are treated as UTC epoch seconds
                ts = datetime.utcfromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S")
            # partition() tolerates a date-only string (time becomes "")
            date, _, time = ts.partition("T")
            # Strip fractional seconds when there is exactly one "."
            if time.count(".") == 1:
                time = time.split(".")[0]
            element["date"] = date
            element["time"] = time
        results_parsed.append(element)
    return results_parsed
def query_results(
request,
query_params,

164
core/lib/processing.py Normal file
View File

@ -0,0 +1,164 @@
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
from django.conf import settings
from datetime import datetime
def annotate_results(results_parsed):
    """
    Decorate parsed results with presence and membership figures.

    For every distinct ``net`` value found in ``results_parsed``, the nicks
    and channels of the IRC-sourced entries on that network are collected and
    passed to ``annotate_online``, ``annotate_num_users`` and
    ``annotate_num_chans``. Whatever those lookups hold for an entry's nick or
    channel is copied onto the entry as ``online``, ``num_users`` and
    ``num_chans``.

    The entries are modified in place; nothing is returned.
    """
    # Entries without a "net" key (e.g. Discord) are never annotated
    networks = {entry["net"] for entry in results_parsed if "net" in entry}

    for net in networks:

        def irc_values(key):
            """Unique values of ``key`` among the IRC entries on this net."""
            required = {key, "src", "net"}
            return list(
                {
                    entry[key]
                    for entry in results_parsed
                    if required.issubset(entry)
                    and entry["src"] == "irc"
                    and entry["net"] == net
                }
            )

        nicks = irc_values("nick")
        channels = irc_values("channel")

        # Online status of each nick, from Threshold
        online_info = annotate_online(net, nicks)
        # Number of users in each channel
        num_users = annotate_num_users(net, channels)
        # Number of channels each nick is on
        num_chans = annotate_num_chans(net, nicks)

        for item in results_parsed:
            if "net" not in item or item["net"] != net:
                continue
            has_nick = "nick" in item
            if has_nick and item["nick"] in online_info:
                item["online"] = online_info[item["nick"]]
            if "channel" in item and item["channel"] in num_users:
                item["num_users"] = num_users[item["channel"]]
            if has_nick and item["nick"] in num_chans:
                item["num_chans"] = num_chans[item["nick"]]
def filter_blacklisted(user, response):
    """
    Redact blacklisted hits from a raw search response, in place.

    ``settings.OPENSEARCH_BLACKLISTED`` maps a field name to the values of
    that field which must be kept secret. A hit whose field value (compared
    as ``str``) equals one of those values is:

    * removed, when the user is anonymous or lacks the
      ``core.bypass_blacklist`` permission;
    * kept and flagged with ``exemption = True`` inside its data otherwise.

    ``response["redacted"]`` counts the hits that matched (each hit once) and
    ``response["exemption"]`` is ``True`` for superusers, ``None`` otherwise.

    :param user: object exposing ``is_superuser``, ``is_anonymous`` and
        ``has_perm`` (presumably a Django user - confirm against callers)
    :param response: response dict containing ``response["hits"]["hits"]``
    :return: ``False`` if a hit carried neither ``_source`` nor ``fields``
        while a blacklist is configured, otherwise ``None``. Unlike before,
        such a hit no longer aborts the filtering of the remaining hits and
        no ``None`` placeholders are left behind in the hit list.
    """
    response["redacted"] = 0
    response["exemption"] = None
    if user.is_superuser:
        response["exemption"] = True

    blacklist = settings.OPENSEARCH_BLACKLISTED
    hits = response["hits"]["hits"]
    malformed = False

    for index, item in enumerate(hits):
        # Locate the hit's data once, not once per blacklisted type
        if "_source" in item:
            data_index = "_source"
        elif "fields" in item:
            data_index = "fields"
        else:
            # Nothing to inspect on this hit; report it but keep filtering
            if blacklist:
                malformed = True
            continue
        data = item[data_index]

        for blacklisted_type, blacklisted_values in blacklist.items():
            if blacklisted_type not in data:
                continue
            content = str(data[blacklisted_type])
            if not any(entry == content for entry in blacklisted_values):
                continue
            # Let the UI know something was redacted, once per hit
            if "exemption" not in data:
                response["redacted"] += 1
            if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
                # Blank the slot rather than delete it so indexes stay valid
                hits[index] = None
                break
            data["exemption"] = True

    # Actually get rid of all the things we set to None
    response["hits"]["hits"] = [hit for hit in hits if hit]
    if malformed:
        return False
    return None
def parse_results(results):
    """
    Flatten a raw search response into a list of result dicts.

    Each hit's data is read from ``_source`` or, failing that, ``fields``.
    ``fields`` values wrapped in lists are unwrapped to their first item
    (empty lists are dropped), ``host`` and ``channel`` are converted to
    ``str``, the hit's ``_id`` is stored as ``id``, keys whose value is the
    empty string are removed and the timestamp is split into ``date`` and
    ``time`` keys.

    :param results: response dict; hits are read from
        ``results["hits"]["hits"]``
    :return: list of result dicts (empty if the response has no hits), or
        ``False`` if a hit has neither ``_source`` nor ``fields``
    """
    results_parsed = []
    stringify = ["host", "channel"]
    if "hits" not in results or "hits" not in results["hits"]:
        return results_parsed

    for item in results["hits"]["hits"]:
        if "_source" in item:
            data_index = "_source"
        elif "fields" in item:
            data_index = "fields"
        else:
            return False
        element = item[data_index]

        # Why are fields in lists...
        # Unwrap BEFORE stringifying: str(["#chan"])[0] would be "[".
        # Non-list values (e.g. a boolean "exemption" flag) pass through.
        if data_index == "fields":
            unwrapped = {}
            for key, value in element.items():
                if isinstance(value, (list, tuple)):
                    if not value:
                        continue
                    value = value[0]
                unwrapped[key] = value
            element = unwrapped

        for field in stringify:
            if field in element:
                element[field] = str(element[field])
        element["id"] = item["_id"]

        # Remove empty values (collect keys first; no deletion mid-iteration)
        for field in [key for key, value in element.items() if value == ""]:
            del element[field]

        # Older records carry the timestamp under "time" (will fix data later)
        if "ts" not in element and "time" in element:
            element["ts"] = element.pop("time")

        # Split the timestamp into date and time
        if "ts" in element:
            ts = element["ts"]
            if not isinstance(ts, str):
                # Numeric timestamps are treated as UTC epoch seconds
                ts = datetime.utcfromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S")
            # partition() tolerates a date-only string (time becomes "")
            date, _, time = ts.partition("T")
            # Strip fractional seconds when there is exactly one "."
            if time.count(".") == 1:
                time = time.split(".")[0]
            element["date"] = date
            element["time"] = time
        results_parsed.append(element)
    return results_parsed

View File

@ -17,7 +17,7 @@
value="{{ params.query }}"
class="input"
type="text"
placeholder="Token search: (science | tech | art) + (interest) -hello">
placeholder="Search something">
<span class="icon is-small is-left">
<i class="fas fa-magnifying-glass"></i>
</span>
@ -76,28 +76,6 @@
</a>
</p>
</div>
<div class="control">
<div class="field rounded-tooltip">
<input
id="full_query"
type="checkbox"
class="switch is-rounded is-info"
{% if params.query_full is not None %}checked="checked"{% else %}none{% endif %}
{% if False %}
{# what are you looking at? #}
disabled
{% endif %}
data-script="on click toggle .is-hidden on #query_full">
<label
for="full_query">
Full query
</label>
{% if False %}
{# what are you looking at? #}
<span class="tooltiptext tag is-danger is-light">No access</span>
{% endif %}
</div>
</div>
</div>
<div class="column is-narrow">
<div class="field has-addons block">
@ -406,23 +384,6 @@
</div>
</div>
</div>
<div id="query_full" class="block {% if params.query_full is None %}is-hidden{% endif %}">
<div class="control is-expanded has-icons-left">
<input
hx-post="{% url 'search' %}"
hx-trigger="keyup changed delay:200ms"
hx-target="#results"
hx-swap="innerHTML"
name="query_full"
value="{{ params.query_full }}"
class="input"
type="text"
placeholder="Full query: msg: science AND src: 4ch AND channel: 100293">
<span class="icon is-small is-left">
<i class="fas fa-magnifying-glass"></i>
</span>
</div>
</div>
<div class="block">
<input
hx-trigger="change"

View File

@ -12,7 +12,8 @@ from rest_framework.parsers import FormParser
from rest_framework.views import APIView
from core.lib.context import construct_query
#from core.lib.opensearch import query_results
# from core.lib.opensearch import query_results
from core.lib.manticore import query_results
from core.lib.threshold import (
annotate_num_chans,
@ -367,6 +368,8 @@ class DrilldownContextModal(APIView):
if query_params["type"] not in ["znc", "auth"]:
annotate = True
# Create the query with the context helper
if query_params["num"].isdigit():
query_params["num"] = int(query_params["num"])
search_query = construct_query(
query_params["index"],
query_params["net"],
@ -403,6 +406,7 @@ class DrilldownContextModal(APIView):
# for index, item in enumerate(results["object_list"]):
# results["object_list"][index]["time"] = item["time"]+"SSS"
unique = str(uuid.uuid4())[:8]
print("PARAMS", query_params)
context = {
"net": query_params["net"],
"channel": query_params["channel"],