Fix some Manticore queries

This commit is contained in:
Mark Veidemanis 2022-09-06 11:53:32 +01:00
parent 3b8735be72
commit 87324de666
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
6 changed files with 215 additions and 225 deletions

View File

@ -4,9 +4,9 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
extra_should = [] extra_should = []
extra_should2 = [] extra_should2 = []
if num: if num:
extra_must.append({"match": {"num": num}}) extra_must.append({"equals": {"num": num}})
if net: if net:
extra_must.append({"match": {"net": net}}) extra_must.append({"match_phrase": {"net": net}})
if channel: if channel:
extra_must.append({"match": {"channel": channel}}) extra_must.append({"match": {"channel": channel}})
if nicks: if nicks:
@ -52,31 +52,36 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
extra_should.append({"match": {"nick": channel}}) extra_should.append({"match": {"nick": channel}})
else: else:
for ctype in types: for ctype in types:
extra_should.append({"match": {"mtype": ctype}}) extra_should.append({"equals": {"mtype": ctype}})
else: else:
for ctype in types: for ctype in types:
extra_should.append({"match": {"type": ctype}}) extra_should.append({"match": {"type": ctype}})
query = { query = {
"size": size, "index": index,
"limit": size,
"query": { "query": {
"bool": { "bool": {
"must": [ "must": [
{"match": {"src": src}}, # {"equals": {"src": src}},
{ # {
"bool": { # "bool": {
"should": [*extra_should], # "should": [*extra_should],
} # }
}, # },
{ # {
"bool": { # "bool": {
"should": [*extra_should2], # "should": [*extra_should2],
} # }
}, # },
*extra_must, *extra_must,
] ]
} }
}, },
"fields": fields, "fields": fields,
"_source": False, # "_source": False,
} }
if extra_should:
query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should]}})
if extra_should2:
query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should2]}})
return query return query

View File

@ -1,10 +1,13 @@
from re import search from datetime import datetime
from django.conf import settings
from core.lib.opensearch import annotate_results, filter_blacklisted, parse_results
import manticoresearch
from core.views.helpers import dedup_list
from pprint import pprint from pprint import pprint
import manticoresearch
from django.conf import settings
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
from core.views.helpers import dedup_list
def initialise_manticore(): def initialise_manticore():
""" """
Initialise the Manticore client Initialise the Manticore client
@ -15,8 +18,10 @@ def initialise_manticore():
return (api_client, api_instance) return (api_client, api_instance)
api_client, client = initialise_manticore() api_client, client = initialise_manticore()
def construct_query(query, size, index, blank=False): def construct_query(query, size, index, blank=False):
""" """
Accept some query parameters and construct an OpenSearch query. Accept some query parameters and construct an OpenSearch query.
@ -35,12 +40,14 @@ def construct_query(query, size, index, blank=False):
query_base["query"]["bool"]["must"].append(query_string) query_base["query"]["bool"]["must"].append(query_string)
return query_base return query_base
def run_query(client, user, search_query): def run_query(client, user, search_query):
response = client.search(search_query) response = client.search(search_query)
response = response.to_dict() response = response.to_dict()
filter_blacklisted(user, response) filter_blacklisted(user, response)
return response return response
def query_results( def query_results(
request, request,
query_params, query_params,
@ -110,6 +117,9 @@ def query_results(
query = query_params["query"] query = query_params["query"]
search_query = construct_query(query, size, index) search_query = construct_query(query, size, index)
query_created = True query_created = True
else:
if custom_query:
search_query = custom_query
if tags: if tags:
# Get a blank search query # Get a blank search query
@ -159,13 +169,16 @@ def query_results(
add_top.append(add_top_tmp) add_top.append(add_top_tmp)
print("AFTER", add_top) print("AFTER", add_top)
# Date/time range # Date/time range
if set({"from_date", "to_date", "from_time", "to_time"}).issubset( if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys() query_params.keys()
): ):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z" from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z" to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
from_ts = int(from_ts.timestamp())
to_ts = int(to_ts.timestamp())
range_query = { range_query = {
"range": { "range": {
"ts": { "ts": {
@ -247,7 +260,6 @@ def query_results(
if sort: if sort:
search_query["sort"] = sort search_query["sort"] = sort
pprint(search_query) pprint(search_query)
results = run_query( results = run_query(
client, client,
@ -256,7 +268,7 @@ def query_results(
) )
if not results: if not results:
return False return False
#results = results.to_dict() # results = results.to_dict()
results_parsed = parse_results(results) results_parsed = parse_results(results)
if annotate: if annotate:
annotate_results(results_parsed) annotate_results(results_parsed)
@ -280,4 +292,5 @@ def query_results(
"card": results["hits"]["total"], "card": results["hits"]["total"],
"took": results["took"], "took": results["took"],
} }
return context print("RTRN", context)
return context

View File

@ -5,11 +5,10 @@ from django.conf import settings
from opensearchpy import OpenSearch from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError from opensearchpy.exceptions import NotFoundError, RequestError
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
from core.views.helpers import dedup_list
from datetime import datetime
# from json import dumps # from json import dumps
# pp = lambda x: print(dumps(x, indent=2)) # pp = lambda x: print(dumps(x, indent=2))
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
from core.views.helpers import dedup_list
def initialise_opensearch(): def initialise_opensearch():
@ -37,114 +36,6 @@ def initialise_opensearch():
client = initialise_opensearch() client = initialise_opensearch()
def annotate_results(results_parsed):
    """
    Enrich parsed result dicts with live information about their network.

    For every distinct "net" value found in results_parsed, the nicks and
    channels of the IRC entries (src == "irc") on that network are collected
    and handed to annotate_online, annotate_num_users and annotate_num_chans.
    Entries on that network then gain the "online", "num_users" and
    "num_chans" keys whenever the lookups returned something for them.
    The dicts are updated in place; nothing is returned.
    """

    def irc_values(key, network):
        # Unique values of `key` among the IRC entries of one network
        wanted = {key, "src", "net"}
        return list(
            {
                row[key]
                for row in results_parsed
                if wanted.issubset(row)
                and row["src"] == "irc"
                and row["net"] == network
            }
        )

    # Only entries carrying a "net" key take part (not discord)
    networks = {row["net"] for row in results_parsed if "net" in row}

    for network in networks:
        nicks = irc_values("nick", network)
        channels = irc_values("channel", network)

        # Online status of each nick, from Threshold
        online_info = annotate_online(network, nicks)
        # Number of users in each channel
        num_users = annotate_num_users(network, channels)
        # Number of channels each nick is on
        num_chans = annotate_num_chans(network, nicks)

        for row in results_parsed:
            if "net" not in row or row["net"] != network:
                continue
            has_nick = "nick" in row
            if has_nick and row["nick"] in online_info:
                row["online"] = online_info[row["nick"]]
            if "channel" in row and row["channel"] in num_users:
                row["num_users"] = num_users[row["channel"]]
            if has_nick and row["nick"] in num_chans:
                row["num_chans"] = num_chans[row["nick"]]
def filter_blacklisted(user, response):
    """
    Low level filter to take the raw search response and remove
    objects from it we want to keep secret.

    settings.OPENSEARCH_BLACKLISTED maps a field name to the values that must
    be hidden. A hit is blacklisted when the string form of one of its fields
    (read from "_source" or "fields") equals one of those values.

    The response is mutated in place:
    - response["redacted"] counts the blacklisted hits that were found
    - response["exemption"] is True for superusers, None otherwise
    - blacklisted hits are removed from response["hits"]["hits"], unless the
      user has the core.bypass_blacklist permission, in which case the hit is
      kept and its data gains "exemption": True

    :param user: object exposing is_superuser, is_anonymous and has_perm()
    :param response: search response dict containing response["hits"]["hits"]
    :return: None
    """
    response["redacted"] = 0
    response["exemption"] = True if user.is_superuser else None

    blacklist = settings.OPENSEARCH_BLACKLISTED
    visible_hits = []
    for hit in response["hits"]["hits"]:
        if "_source" in hit:
            data = hit["_source"]
        elif "fields" in hit:
            data = hit["fields"]
        else:
            # Unknown shape: there is nothing to match against. Keep the hit
            # and carry on, rather than aborting and leaving the remaining
            # hits unfiltered.
            visible_hits.append(hit)
            continue

        blacklisted = False
        for field, banned_values in blacklist.items():
            if field not in data:
                continue
            content = data[field]
            candidates = [str(content)]
            # Values may arrive wrapped in a list (parse_results unwraps them
            # for "fields"), so every member is checked as well.
            if isinstance(content, (list, tuple)):
                candidates.extend(str(value) for value in content)
            if any(banned in candidates for banned in banned_values):
                blacklisted = True
                break

        if not blacklisted:
            visible_hits.append(hit)
            continue

        # Let the UI know something was redacted
        if "exemption" not in data:
            response["redacted"] += 1
        if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
            # Not allowed to see it: drop the hit
            continue
        data["exemption"] = True
        visible_hits.append(hit)

    # Falsy hits are discarded, as before
    response["hits"]["hits"] = [hit for hit in visible_hits if hit]
def construct_query(query, size, use_query_string=True, tokens=False): def construct_query(query, size, use_query_string=True, tokens=False):
""" """
Accept some query parameters and construct an OpenSearch query. Accept some query parameters and construct an OpenSearch query.
@ -233,54 +124,6 @@ def run_main_query(client, user, query, custom_query=False, index=None, size=Non
return response return response
def parse_results(results):
    """
    Flatten a raw search response into a list of plain result dicts.

    Each hit's data is read from "_source" or, failing that, "fields".
    Values under "fields" that are lists are replaced by their first member
    (empty lists are dropped). "host" and "channel" are converted to strings,
    the hit's "_id" is stored as "id", and the timestamp is split into
    "date" and "time" keys (fractional seconds are cut from "time").
    A legacy "time" key is treated as "ts" when "ts" is missing.

    The hits inside `results` are not modified; each element is a new dict.

    :param results: response dict; hits are expected in results["hits"]["hits"]
    :return: list of parsed dicts (empty when there are no hits), or False
             when a hit has neither "_source" nor "fields"
    """
    # The surrounding module only imports `datetime` itself
    from datetime import timezone

    stringify = ("host", "channel")
    results_parsed = []
    if "hits" not in results or "hits" not in results["hits"]:
        return results_parsed

    for item in results["hits"]["hits"]:
        if "_source" in item:
            element = dict(item["_source"])
        elif "fields" in item:
            # Why are fields in lists...
            # Unwrap before stringifying, otherwise str() is applied to the
            # list and only its first character ("[") would survive.
            element = {}
            for key, value in item["fields"].items():
                if isinstance(value, (list, tuple)):
                    if value:
                        element[key] = value[0]
                else:
                    element[key] = value
        else:
            # NOTE(review): one malformed hit discards the whole result set;
            # kept because callers may test for False.
            return False

        for field in stringify:
            if field in element:
                element[field] = str(element[field])
        element["id"] = item["_id"]

        # Split the timestamp into date and time
        if "ts" not in element and "time" in element:  # will fix data later
            element["ts"] = element.pop("time")
        if "ts" in element:
            ts = element["ts"]
            if not isinstance(ts, str):
                # Numeric timestamps are taken as seconds since the epoch, UTC
                ts = datetime.fromtimestamp(ts, tz=timezone.utc).strftime(
                    "%Y-%m-%dT%H:%M:%S"
                )
            separator = "T" if "T" in ts else " "
            date, _, clock = ts.partition(separator)
            element["date"] = date
            # Cut fractional seconds when there is exactly one "."
            if clock.count(".") == 1:
                clock = clock.split(".")[0]
            element["time"] = clock
        results_parsed.append(element)
    return results_parsed
def query_results( def query_results(
request, request,
query_params, query_params,

164
core/lib/processing.py Normal file
View File

@ -0,0 +1,164 @@
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
from django.conf import settings
from datetime import datetime
def annotate_results(results_parsed):
    """
    Enrich parsed result dicts with live information about their network.

    For every distinct "net" value found in results_parsed, the nicks and
    channels of the IRC entries (src == "irc") on that network are collected
    and handed to annotate_online, annotate_num_users and annotate_num_chans.
    Entries on that network then gain the "online", "num_users" and
    "num_chans" keys whenever the lookups returned something for them.
    The dicts are updated in place; nothing is returned.
    """

    def irc_values(key, network):
        # Unique values of `key` among the IRC entries of one network
        wanted = {key, "src", "net"}
        return list(
            {
                row[key]
                for row in results_parsed
                if wanted.issubset(row)
                and row["src"] == "irc"
                and row["net"] == network
            }
        )

    # Only entries carrying a "net" key take part (not discord)
    networks = {row["net"] for row in results_parsed if "net" in row}

    for network in networks:
        nicks = irc_values("nick", network)
        channels = irc_values("channel", network)

        # Online status of each nick, from Threshold
        online_info = annotate_online(network, nicks)
        # Number of users in each channel
        num_users = annotate_num_users(network, channels)
        # Number of channels each nick is on
        num_chans = annotate_num_chans(network, nicks)

        for row in results_parsed:
            if "net" not in row or row["net"] != network:
                continue
            has_nick = "nick" in row
            if has_nick and row["nick"] in online_info:
                row["online"] = online_info[row["nick"]]
            if "channel" in row and row["channel"] in num_users:
                row["num_users"] = num_users[row["channel"]]
            if has_nick and row["nick"] in num_chans:
                row["num_chans"] = num_chans[row["nick"]]
def filter_blacklisted(user, response):
    """
    Low level filter to take the raw search response and remove
    objects from it we want to keep secret.

    settings.OPENSEARCH_BLACKLISTED maps a field name to the values that must
    be hidden. A hit is blacklisted when the string form of one of its fields
    (read from "_source" or "fields") equals one of those values.

    The response is mutated in place:
    - response["redacted"] counts the blacklisted hits that were found
    - response["exemption"] is True for superusers, None otherwise
    - blacklisted hits are removed from response["hits"]["hits"], unless the
      user has the core.bypass_blacklist permission, in which case the hit is
      kept and its data gains "exemption": True

    :param user: object exposing is_superuser, is_anonymous and has_perm()
    :param response: search response dict containing response["hits"]["hits"]
    :return: None
    """
    response["redacted"] = 0
    response["exemption"] = True if user.is_superuser else None

    blacklist = settings.OPENSEARCH_BLACKLISTED
    visible_hits = []
    for hit in response["hits"]["hits"]:
        if "_source" in hit:
            data = hit["_source"]
        elif "fields" in hit:
            data = hit["fields"]
        else:
            # Unknown shape: there is nothing to match against. Keep the hit
            # and carry on, rather than aborting and leaving the remaining
            # hits unfiltered.
            visible_hits.append(hit)
            continue

        blacklisted = False
        for field, banned_values in blacklist.items():
            if field not in data:
                continue
            content = data[field]
            candidates = [str(content)]
            # Values may arrive wrapped in a list (parse_results unwraps them
            # for "fields"), so every member is checked as well.
            if isinstance(content, (list, tuple)):
                candidates.extend(str(value) for value in content)
            if any(banned in candidates for banned in banned_values):
                blacklisted = True
                break

        if not blacklisted:
            visible_hits.append(hit)
            continue

        # Let the UI know something was redacted
        if "exemption" not in data:
            response["redacted"] += 1
        if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
            # Not allowed to see it: drop the hit
            continue
        data["exemption"] = True
        visible_hits.append(hit)

    # Falsy hits are discarded, as before
    response["hits"]["hits"] = [hit for hit in visible_hits if hit]
def parse_results(results):
    """
    Flatten a raw search response into a list of plain result dicts.

    Each hit's data is read from "_source" or, failing that, "fields".
    Values under "fields" that are lists are replaced by their first member
    (empty lists are dropped). "host" and "channel" are converted to strings,
    the hit's "_id" is stored as "id", keys whose value is an empty string
    are removed, and the timestamp is split into "date" and "time" keys
    (fractional seconds are cut from "time"). A legacy "time" key is treated
    as "ts" when "ts" is missing.

    The hits inside `results` are not modified; each element is a new dict.

    :param results: response dict; hits are expected in results["hits"]["hits"]
    :return: list of parsed dicts (empty when there are no hits), or False
             when a hit has neither "_source" nor "fields"
    """
    # The surrounding module only imports `datetime` itself
    from datetime import timezone

    stringify = ("host", "channel")
    results_parsed = []
    if "hits" not in results or "hits" not in results["hits"]:
        return results_parsed

    for item in results["hits"]["hits"]:
        if "_source" in item:
            element = dict(item["_source"])
        elif "fields" in item:
            # Why are fields in lists...
            # Unwrap before stringifying, otherwise str() is applied to the
            # list and only its first character ("[") would survive.
            element = {}
            for key, value in item["fields"].items():
                if isinstance(value, (list, tuple)):
                    if value:
                        element[key] = value[0]
                else:
                    element[key] = value
        else:
            # NOTE(review): one malformed hit discards the whole result set;
            # kept because callers may test for False.
            return False

        for field in stringify:
            if field in element:
                element[field] = str(element[field])
        element["id"] = item["_id"]

        # Remove empty values
        element = {key: value for key, value in element.items() if value != ""}

        # Split the timestamp into date and time
        if "ts" not in element and "time" in element:  # will fix data later
            element["ts"] = element.pop("time")
        if "ts" in element:
            ts = element["ts"]
            if not isinstance(ts, str):
                # Numeric timestamps are taken as seconds since the epoch, UTC
                ts = datetime.fromtimestamp(ts, tz=timezone.utc).strftime(
                    "%Y-%m-%dT%H:%M:%S"
                )
            separator = "T" if "T" in ts else " "
            date, _, clock = ts.partition(separator)
            element["date"] = date
            # Cut fractional seconds when there is exactly one "."
            if clock.count(".") == 1:
                clock = clock.split(".")[0]
            element["time"] = clock
        results_parsed.append(element)
    return results_parsed

View File

@ -17,7 +17,7 @@
value="{{ params.query }}" value="{{ params.query }}"
class="input" class="input"
type="text" type="text"
placeholder="Token search: (science | tech | art) + (interest) -hello"> placeholder="Search something">
<span class="icon is-small is-left"> <span class="icon is-small is-left">
<i class="fas fa-magnifying-glass"></i> <i class="fas fa-magnifying-glass"></i>
</span> </span>
@ -76,28 +76,6 @@
</a> </a>
</p> </p>
</div> </div>
<div class="control">
<div class="field rounded-tooltip">
<input
id="full_query"
type="checkbox"
class="switch is-rounded is-info"
{% if params.query_full is not None %}checked="checked"{% else %}none{% endif %}
{% if False %}
{# what are you looking at? #}
disabled
{% endif %}
data-script="on click toggle .is-hidden on #query_full">
<label
for="full_query">
Full query
</label>
{% if False %}
{# what are you looking at? #}
<span class="tooltiptext tag is-danger is-light">No access</span>
{% endif %}
</div>
</div>
</div> </div>
<div class="column is-narrow"> <div class="column is-narrow">
<div class="field has-addons block"> <div class="field has-addons block">
@ -406,23 +384,6 @@
</div> </div>
</div> </div>
</div> </div>
<div id="query_full" class="block {% if params.query_full is None %}is-hidden{% endif %}">
<div class="control is-expanded has-icons-left">
<input
hx-post="{% url 'search' %}"
hx-trigger="keyup changed delay:200ms"
hx-target="#results"
hx-swap="innerHTML"
name="query_full"
value="{{ params.query_full }}"
class="input"
type="text"
placeholder="Full query: msg: science AND src: 4ch AND channel: 100293">
<span class="icon is-small is-left">
<i class="fas fa-magnifying-glass"></i>
</span>
</div>
</div>
<div class="block"> <div class="block">
<input <input
hx-trigger="change" hx-trigger="change"

View File

@ -12,7 +12,8 @@ from rest_framework.parsers import FormParser
from rest_framework.views import APIView from rest_framework.views import APIView
from core.lib.context import construct_query from core.lib.context import construct_query
#from core.lib.opensearch import query_results
# from core.lib.opensearch import query_results
from core.lib.manticore import query_results from core.lib.manticore import query_results
from core.lib.threshold import ( from core.lib.threshold import (
annotate_num_chans, annotate_num_chans,
@ -367,6 +368,8 @@ class DrilldownContextModal(APIView):
if query_params["type"] not in ["znc", "auth"]: if query_params["type"] not in ["znc", "auth"]:
annotate = True annotate = True
# Create the query with the context helper # Create the query with the context helper
if query_params["num"].isdigit():
query_params["num"] = int(query_params["num"])
search_query = construct_query( search_query = construct_query(
query_params["index"], query_params["index"],
query_params["net"], query_params["net"],
@ -403,6 +406,7 @@ class DrilldownContextModal(APIView):
# for index, item in enumerate(results["object_list"]): # for index, item in enumerate(results["object_list"]):
# results["object_list"][index]["time"] = item["time"]+"SSS" # results["object_list"][index]["time"] = item["time"]+"SSS"
unique = str(uuid.uuid4())[:8] unique = str(uuid.uuid4())[:8]
print("PARAMS", query_params)
context = { context = {
"net": query_params["net"], "net": query_params["net"],
"channel": query_params["channel"], "channel": query_params["channel"],