Fix some Manticore queries
This commit is contained in:
parent
3b8735be72
commit
87324de666
|
@ -4,9 +4,9 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
|
|||
extra_should = []
|
||||
extra_should2 = []
|
||||
if num:
|
||||
extra_must.append({"match": {"num": num}})
|
||||
extra_must.append({"equals": {"num": num}})
|
||||
if net:
|
||||
extra_must.append({"match": {"net": net}})
|
||||
extra_must.append({"match_phrase": {"net": net}})
|
||||
if channel:
|
||||
extra_must.append({"match": {"channel": channel}})
|
||||
if nicks:
|
||||
|
@ -52,31 +52,36 @@ def construct_query(index, net, channel, src, num, size, type=None, nicks=None):
|
|||
extra_should.append({"match": {"nick": channel}})
|
||||
else:
|
||||
for ctype in types:
|
||||
extra_should.append({"match": {"mtype": ctype}})
|
||||
extra_should.append({"equals": {"mtype": ctype}})
|
||||
else:
|
||||
for ctype in types:
|
||||
extra_should.append({"match": {"type": ctype}})
|
||||
query = {
|
||||
"size": size,
|
||||
"index": index,
|
||||
"limit": size,
|
||||
"query": {
|
||||
"bool": {
|
||||
"must": [
|
||||
{"match": {"src": src}},
|
||||
{
|
||||
"bool": {
|
||||
"should": [*extra_should],
|
||||
}
|
||||
},
|
||||
{
|
||||
"bool": {
|
||||
"should": [*extra_should2],
|
||||
}
|
||||
},
|
||||
# {"equals": {"src": src}},
|
||||
# {
|
||||
# "bool": {
|
||||
# "should": [*extra_should],
|
||||
# }
|
||||
# },
|
||||
# {
|
||||
# "bool": {
|
||||
# "should": [*extra_should2],
|
||||
# }
|
||||
# },
|
||||
*extra_must,
|
||||
]
|
||||
}
|
||||
},
|
||||
"fields": fields,
|
||||
"_source": False,
|
||||
# "_source": False,
|
||||
}
|
||||
if extra_should:
|
||||
query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should]}})
|
||||
if extra_should2:
|
||||
query["query"]["bool"]["must"].append({"bool": {"should": [*extra_should2]}})
|
||||
return query
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
from re import search
|
||||
from django.conf import settings
|
||||
from core.lib.opensearch import annotate_results, filter_blacklisted, parse_results
|
||||
import manticoresearch
|
||||
from core.views.helpers import dedup_list
|
||||
from datetime import datetime
|
||||
from pprint import pprint
|
||||
|
||||
import manticoresearch
|
||||
from django.conf import settings
|
||||
|
||||
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
|
||||
from core.views.helpers import dedup_list
|
||||
|
||||
|
||||
def initialise_manticore():
|
||||
"""
|
||||
Initialise the Manticore client
|
||||
|
@ -15,8 +18,10 @@ def initialise_manticore():
|
|||
|
||||
return (api_client, api_instance)
|
||||
|
||||
|
||||
# Module-level Manticore handles, created once when this module is imported.
# initialise_manticore() returns the pair (api_client, api_instance); the
# second element is bound to `client` here.
api_client, client = initialise_manticore()
|
||||
|
||||
|
||||
def construct_query(query, size, index, blank=False):
|
||||
"""
|
||||
Accept some query parameters and construct an OpenSearch query.
|
||||
|
@ -35,12 +40,14 @@ def construct_query(query, size, index, blank=False):
|
|||
query_base["query"]["bool"]["must"].append(query_string)
|
||||
return query_base
|
||||
|
||||
|
||||
def run_query(client, user, search_query):
    """
    Run a search and return the blacklist-filtered response.

    The query is executed with client.search(), the result is converted
    with its to_dict() method, and that dict is handed to filter_blacklisted
    together with `user` before being returned.
    """
    result = client.search(search_query).to_dict()
    # filter_blacklisted mutates the dict it is given; its return value is unused
    filter_blacklisted(user, result)
    return result
|
||||
|
||||
|
||||
def query_results(
|
||||
request,
|
||||
query_params,
|
||||
|
@ -110,6 +117,9 @@ def query_results(
|
|||
query = query_params["query"]
|
||||
search_query = construct_query(query, size, index)
|
||||
query_created = True
|
||||
else:
|
||||
if custom_query:
|
||||
search_query = custom_query
|
||||
|
||||
if tags:
|
||||
# Get a blank search query
|
||||
|
@ -159,13 +169,16 @@ def query_results(
|
|||
add_top.append(add_top_tmp)
|
||||
print("AFTER", add_top)
|
||||
|
||||
|
||||
# Date/time range
|
||||
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
|
||||
query_params.keys()
|
||||
):
|
||||
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
|
||||
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
|
||||
from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
|
||||
to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
|
||||
from_ts = int(from_ts.timestamp())
|
||||
to_ts = int(to_ts.timestamp())
|
||||
range_query = {
|
||||
"range": {
|
||||
"ts": {
|
||||
|
@ -247,7 +260,6 @@ def query_results(
|
|||
if sort:
|
||||
search_query["sort"] = sort
|
||||
|
||||
|
||||
pprint(search_query)
|
||||
results = run_query(
|
||||
client,
|
||||
|
@ -280,4 +292,5 @@ def query_results(
|
|||
"card": results["hits"]["total"],
|
||||
"took": results["took"],
|
||||
}
|
||||
print("RTRN", context)
|
||||
return context
|
|
@ -5,11 +5,10 @@ from django.conf import settings
|
|||
from opensearchpy import OpenSearch
|
||||
from opensearchpy.exceptions import NotFoundError, RequestError
|
||||
|
||||
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
|
||||
from core.views.helpers import dedup_list
|
||||
from datetime import datetime
|
||||
# from json import dumps
|
||||
# pp = lambda x: print(dumps(x, indent=2))
|
||||
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
|
||||
from core.views.helpers import dedup_list
|
||||
|
||||
|
||||
def initialise_opensearch():
|
||||
|
@ -37,114 +36,6 @@ def initialise_opensearch():
|
|||
# Module-level client, created once when this module is imported by calling
# initialise_opensearch().
client = initialise_opensearch()
|
||||
|
||||
|
||||
def annotate_results(results_parsed):
    """
    Decorate parsed results with presence data, one network at a time.

    For every distinct "net" value found in the results, the nicks and
    channels of that network's IRC entries (src == "irc") are collected and
    passed to annotate_online, annotate_num_users and annotate_num_chans.
    Matching entries then receive "online", "num_users" and "num_chans" keys.
    The dicts are updated in place; nothing is returned.
    """
    # Entries without a "net" key (not discord) are never annotated
    networks = {entry["net"] for entry in results_parsed if "net" in entry}

    for net in networks:
        # Only IRC entries of this network feed the Threshold lookups
        irc_entries = [
            entry
            for entry in results_parsed
            if "src" in entry
            and "net" in entry
            and entry["src"] == "irc"
            and entry["net"] == net
        ]
        nicks = list({entry["nick"] for entry in irc_entries if "nick" in entry})
        channels = list(
            {entry["channel"] for entry in irc_entries if "channel" in entry}
        )

        # Online state per nick
        online_info = annotate_online(net, nicks)
        # Number of users per channel
        num_users = annotate_num_users(net, channels)
        # Number of channels per nick
        num_chans = annotate_num_chans(net, nicks)

        for entry in results_parsed:
            if "net" not in entry or entry["net"] != net:
                continue
            has_nick = "nick" in entry
            if has_nick and entry["nick"] in online_info:
                entry["online"] = online_info[entry["nick"]]
            if "channel" in entry and entry["channel"] in num_users:
                entry["num_users"] = num_users[entry["channel"]]
            if has_nick and entry["nick"] in num_chans:
                entry["num_chans"] = num_chans[entry["nick"]]
|
||||
|
||||
|
||||
def _hit_is_blacklisted(data):
    """
    Return True when any blacklisted field in `data` holds a blacklisted value.

    settings.OPENSEARCH_BLACKLISTED maps a field name to the values that are
    blacklisted for it; values are compared against str() of the hit's value.
    """
    # NOTE(review): values under "fields" look like lists, whose str() form
    # would not equal a plain blacklisted string - confirm with the data source.
    for field, banned_values in settings.OPENSEARCH_BLACKLISTED.items():
        if field not in data:
            continue
        value = str(data[field])
        if any(banned == value for banned in banned_values):
            return True
    return False


def filter_blacklisted(user, response):
    """
    Low level filter to take the raw OpenSearch response and remove
    objects from it we want to keep secret.

    response["redacted"] is set to the number of blacklisted hits found and
    response["exemption"] to True for superusers (None otherwise).
    Blacklisted hits are dropped for anonymous users and for users without
    the "core.bypass_blacklist" permission; for users holding it the hit is
    kept and tagged with "exemption": True.
    Does not return, the object is mutated in place.
    """
    response["redacted"] = 0
    response["exemption"] = None
    if user.is_superuser:
        response["exemption"] = True

    kept = []
    for hit in response["hits"]["hits"]:
        # Falsy entries are purged, as the final cleanup always did
        if not hit:
            continue
        if "_source" in hit:
            data = hit["_source"]
        elif "fields" in hit:
            data = hit["fields"]
        else:
            # Nothing to match against. Keep the hit and carry on: bailing
            # out here used to leave every later hit unfiltered.
            kept.append(hit)
            continue

        if not _hit_is_blacklisted(data):
            kept.append(hit)
            continue

        # Let the UI know something was redacted (counted once per hit)
        if "exemption" not in data:
            response["redacted"] += 1

        if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
            # Not allowed to see it: drop the hit
            continue
        data["exemption"] = True
        kept.append(hit)

    response["hits"]["hits"] = kept
|
||||
|
||||
|
||||
def construct_query(query, size, use_query_string=True, tokens=False):
|
||||
"""
|
||||
Accept some query parameters and construct an OpenSearch query.
|
||||
|
@ -233,54 +124,6 @@ def run_main_query(client, user, query, custom_query=False, index=None, size=Non
|
|||
return response
|
||||
|
||||
|
||||
def parse_results(results):
    """
    Flatten a raw search response into a list of result dicts.

    Each hit under results["hits"]["hits"] yields one dict, taken from its
    "_source" (used as-is, so that dict is updated in place) or rebuilt from
    its "fields" with the list wrappers removed. "host" and "channel" are
    converted to strings, the hit's "_id" is stored as "id", and the
    timestamp ("ts", or "time" when "ts" is absent) is split into "date"
    and "time" with any fractional seconds removed.

    Returns the list of parsed dicts (empty when the response has no hits),
    or False as soon as a hit has neither "_source" nor "fields"; the early
    False is kept for existing callers.
    """
    # Function-scope import so the block does not depend on the module's
    # import list; used for timezone-aware epoch conversion below.
    from datetime import timezone

    results_parsed = []
    stringify = ["host", "channel"]
    if "hits" not in results or "hits" not in results["hits"]:
        return results_parsed

    for item in results["hits"]["hits"]:
        if "_source" in item:
            element = item["_source"]
        elif "fields" in item:
            # Why are fields in lists...
            # Unwrap BEFORE stringifying: converting the list itself to a
            # string and then taking [0] left just "[" in host/channel.
            element = {}
            for key, value in item["fields"].items():
                if not isinstance(value, (list, tuple)):
                    element[key] = value
                elif value:
                    element[key] = value[0]
        else:
            return False

        for field in stringify:
            if field in element:
                element[field] = str(element[field])
        element["id"] = item["_id"]

        # Split the timestamp into date and time
        if "ts" not in element and "time" in element:  # will fix data later
            element["ts"] = element.pop("time")
        if "ts" in element:
            if isinstance(element["ts"], str):
                ts = element["ts"]
            else:
                # Epoch seconds, rendered in UTC
                ts = datetime.fromtimestamp(
                    element["ts"], tz=timezone.utc
                ).strftime("%Y-%m-%dT%H:%M:%S")
            ts_spl = ts.split("T")
            date = ts_spl[0]
            time = ts_spl[1]
            element["date"] = date
            # Drop fractional seconds only for the plain "HH:MM:SS.fff" shape
            pieces = time.split(".")
            element["time"] = pieces[0] if len(pieces) == 2 else time
        results_parsed.append(element)
    return results_parsed
|
||||
|
||||
|
||||
def query_results(
|
||||
request,
|
||||
query_params,
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
|
||||
from django.conf import settings
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def annotate_results(results_parsed):
    """
    Decorate parsed results with presence data, one network at a time.

    For every distinct "net" value found in the results, the nicks and
    channels of that network's IRC entries (src == "irc") are collected and
    passed to annotate_online, annotate_num_users and annotate_num_chans.
    Matching entries then receive "online", "num_users" and "num_chans" keys.
    The dicts are updated in place; nothing is returned.
    """
    # Entries without a "net" key (not discord) are never annotated
    networks = {entry["net"] for entry in results_parsed if "net" in entry}

    for net in networks:
        # Only IRC entries of this network feed the Threshold lookups
        irc_entries = [
            entry
            for entry in results_parsed
            if "src" in entry
            and "net" in entry
            and entry["src"] == "irc"
            and entry["net"] == net
        ]
        nicks = list({entry["nick"] for entry in irc_entries if "nick" in entry})
        channels = list(
            {entry["channel"] for entry in irc_entries if "channel" in entry}
        )

        # Online state per nick
        online_info = annotate_online(net, nicks)
        # Number of users per channel
        num_users = annotate_num_users(net, channels)
        # Number of channels per nick
        num_chans = annotate_num_chans(net, nicks)

        for entry in results_parsed:
            if "net" not in entry or entry["net"] != net:
                continue
            has_nick = "nick" in entry
            if has_nick and entry["nick"] in online_info:
                entry["online"] = online_info[entry["nick"]]
            if "channel" in entry and entry["channel"] in num_users:
                entry["num_users"] = num_users[entry["channel"]]
            if has_nick and entry["nick"] in num_chans:
                entry["num_chans"] = num_chans[entry["nick"]]
|
||||
|
||||
|
||||
def _hit_is_blacklisted(data):
    """
    Return True when any blacklisted field in `data` holds a blacklisted value.

    settings.OPENSEARCH_BLACKLISTED maps a field name to the values that are
    blacklisted for it; values are compared against str() of the hit's value.
    """
    # NOTE(review): values under "fields" look like lists, whose str() form
    # would not equal a plain blacklisted string - confirm with the data source.
    for field, banned_values in settings.OPENSEARCH_BLACKLISTED.items():
        if field not in data:
            continue
        value = str(data[field])
        if any(banned == value for banned in banned_values):
            return True
    return False


def filter_blacklisted(user, response):
    """
    Low level filter to take the raw OpenSearch response and remove
    objects from it we want to keep secret.

    response["redacted"] is set to the number of blacklisted hits found and
    response["exemption"] to True for superusers (None otherwise).
    Blacklisted hits are dropped for anonymous users and for users without
    the "core.bypass_blacklist" permission; for users holding it the hit is
    kept and tagged with "exemption": True.
    Does not return, the object is mutated in place.
    """
    response["redacted"] = 0
    response["exemption"] = None
    if user.is_superuser:
        response["exemption"] = True

    kept = []
    for hit in response["hits"]["hits"]:
        # Falsy entries are purged, as the final cleanup always did
        if not hit:
            continue
        if "_source" in hit:
            data = hit["_source"]
        elif "fields" in hit:
            data = hit["fields"]
        else:
            # Nothing to match against. Keep the hit and carry on: bailing
            # out here used to leave every later hit unfiltered.
            kept.append(hit)
            continue

        if not _hit_is_blacklisted(data):
            kept.append(hit)
            continue

        # Let the UI know something was redacted (counted once per hit)
        if "exemption" not in data:
            response["redacted"] += 1

        if user.is_anonymous or not user.has_perm("core.bypass_blacklist"):
            # Not allowed to see it: drop the hit
            continue
        data["exemption"] = True
        kept.append(hit)

    response["hits"]["hits"] = kept
|
||||
|
||||
|
||||
def parse_results(results):
    """
    Flatten a raw search response into a list of result dicts.

    Each hit under results["hits"]["hits"] yields one dict, taken from its
    "_source" (used as-is, so that dict is updated in place) or rebuilt from
    its "fields" with the list wrappers removed. "host" and "channel" are
    converted to strings, the hit's "_id" is stored as "id", keys whose
    value is an empty string are removed, and the timestamp ("ts", or "time"
    when "ts" is absent) is split into "date" and "time" with any fractional
    seconds removed.

    Returns the list of parsed dicts (empty when the response has no hits),
    or False as soon as a hit has neither "_source" nor "fields"; the early
    False is kept for existing callers.
    """
    # Function-scope import so the block does not depend on the module's
    # import list; used for timezone-aware epoch conversion below.
    from datetime import timezone

    results_parsed = []
    stringify = ["host", "channel"]
    if "hits" not in results or "hits" not in results["hits"]:
        return results_parsed

    for item in results["hits"]["hits"]:
        if "_source" in item:
            element = item["_source"]
        elif "fields" in item:
            # Why are fields in lists...
            # Unwrap BEFORE stringifying: converting the list itself to a
            # string and then taking [0] left just "[" in host/channel.
            element = {}
            for key, value in item["fields"].items():
                if not isinstance(value, (list, tuple)):
                    element[key] = value
                elif value:
                    element[key] = value[0]
        else:
            return False

        for field in stringify:
            if field in element:
                element[field] = str(element[field])
        element["id"] = item["_id"]

        # Remove empty values
        for field in list(element):
            if element[field] == "":
                del element[field]

        # Split the timestamp into date and time
        if "ts" not in element and "time" in element:  # will fix data later
            element["ts"] = element.pop("time")
        if "ts" in element:
            if isinstance(element["ts"], str):
                ts = element["ts"]
            else:
                # Epoch seconds, rendered in UTC
                ts = datetime.fromtimestamp(
                    element["ts"], tz=timezone.utc
                ).strftime("%Y-%m-%dT%H:%M:%S")
            ts_spl = ts.split("T")
            date = ts_spl[0]
            time = ts_spl[1]
            element["date"] = date
            # Drop fractional seconds only for the plain "HH:MM:SS.fff" shape
            pieces = time.split(".")
            element["time"] = pieces[0] if len(pieces) == 2 else time
        results_parsed.append(element)
    return results_parsed
|
|
@ -17,7 +17,7 @@
|
|||
value="{{ params.query }}"
|
||||
class="input"
|
||||
type="text"
|
||||
placeholder="Token search: (science | tech | art) + (interest) -hello">
|
||||
placeholder="Search something">
|
||||
<span class="icon is-small is-left">
|
||||
<i class="fas fa-magnifying-glass"></i>
|
||||
</span>
|
||||
|
@ -76,28 +76,6 @@
|
|||
</a>
|
||||
</p>
|
||||
</div>
|
||||
<div class="control">
|
||||
<div class="field rounded-tooltip">
|
||||
<input
|
||||
id="full_query"
|
||||
type="checkbox"
|
||||
class="switch is-rounded is-info"
|
||||
{% if params.query_full is not None %}checked="checked"{% else %}none{% endif %}
|
||||
{% if False %}
|
||||
{# what are you looking at? #}
|
||||
disabled
|
||||
{% endif %}
|
||||
data-script="on click toggle .is-hidden on #query_full">
|
||||
<label
|
||||
for="full_query">
|
||||
Full query
|
||||
</label>
|
||||
{% if False %}
|
||||
{# what are you looking at? #}
|
||||
<span class="tooltiptext tag is-danger is-light">No access</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="column is-narrow">
|
||||
<div class="field has-addons block">
|
||||
|
@ -406,23 +384,6 @@
|
|||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div id="query_full" class="block {% if params.query_full is None %}is-hidden{% endif %}">
|
||||
<div class="control is-expanded has-icons-left">
|
||||
<input
|
||||
hx-post="{% url 'search' %}"
|
||||
hx-trigger="keyup changed delay:200ms"
|
||||
hx-target="#results"
|
||||
hx-swap="innerHTML"
|
||||
name="query_full"
|
||||
value="{{ params.query_full }}"
|
||||
class="input"
|
||||
type="text"
|
||||
placeholder="Full query: msg: science AND src: 4ch AND channel: 100293">
|
||||
<span class="icon is-small is-left">
|
||||
<i class="fas fa-magnifying-glass"></i>
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="block">
|
||||
<input
|
||||
hx-trigger="change"
|
||||
|
|
|
@ -12,6 +12,7 @@ from rest_framework.parsers import FormParser
|
|||
from rest_framework.views import APIView
|
||||
|
||||
from core.lib.context import construct_query
|
||||
|
||||
# from core.lib.opensearch import query_results
|
||||
from core.lib.manticore import query_results
|
||||
from core.lib.threshold import (
|
||||
|
@ -367,6 +368,8 @@ class DrilldownContextModal(APIView):
|
|||
if query_params["type"] not in ["znc", "auth"]:
|
||||
annotate = True
|
||||
# Create the query with the context helper
|
||||
if query_params["num"].isdigit():
|
||||
query_params["num"] = int(query_params["num"])
|
||||
search_query = construct_query(
|
||||
query_params["index"],
|
||||
query_params["net"],
|
||||
|
@ -403,6 +406,7 @@ class DrilldownContextModal(APIView):
|
|||
# for index, item in enumerate(results["object_list"]):
|
||||
# results["object_list"][index]["time"] = item["time"]+"SSS"
|
||||
unique = str(uuid.uuid4())[:8]
|
||||
print("PARAMS", query_params)
|
||||
context = {
|
||||
"net": query_params["net"],
|
||||
"channel": query_params["channel"],
|
||||
|
|
Loading…
Reference in New Issue