Begin implementing DB framework

This commit is contained in:
Mark Veidemanis 2022-09-27 15:15:08 +01:00
parent 845b02b0eb
commit 202a13cccb
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
21 changed files with 1316 additions and 968 deletions

View File

@ -63,14 +63,15 @@ from core.views.ui.drilldown import ( # DrilldownTableView,; Drilldown,
DrilldownTableView, DrilldownTableView,
ThresholdInfoModal, ThresholdInfoModal,
) )
from core.views.ui.insights import (
Insights, # from core.views.ui.insights import (
InsightsChannels, # Insights,
InsightsInfoModal, # InsightsChannels,
InsightsMeta, # InsightsInfoModal,
InsightsNicks, # InsightsMeta,
InsightsSearch, # InsightsNicks,
) # InsightsSearch,
# )
urlpatterns = [ urlpatterns = [
path("__debug__/", include("debug_toolbar.urls")), path("__debug__/", include("debug_toolbar.urls")),
@ -100,12 +101,12 @@ urlpatterns = [
path("context/", DrilldownContextModal.as_view(), name="modal_context"), path("context/", DrilldownContextModal.as_view(), name="modal_context"),
path("context_table/", DrilldownContextModal.as_view(), name="modal_context_table"), path("context_table/", DrilldownContextModal.as_view(), name="modal_context_table"),
## ##
path("ui/insights/", Insights.as_view(), name="insights"), # path("ui/insights/", Insights.as_view(), name="insights"),
path("ui/insights/search/", InsightsSearch.as_view(), name="search_insights"), # path("ui/insights/search/", InsightsSearch.as_view(), name="search_insights"),
path("ui/insights/channels/", InsightsChannels.as_view(), name="chans_insights"), # path("ui/insights/channels/", InsightsChannels.as_view(), name="chans_insights"),
path("ui/insights/nicks/", InsightsNicks.as_view(), name="nicks_insights"), # path("ui/insights/nicks/", InsightsNicks.as_view(), name="nicks_insights"),
path("ui/insights/meta/", InsightsMeta.as_view(), name="meta_insights"), # path("ui/insights/meta/", InsightsMeta.as_view(), name="meta_insights"),
path("ui/insights/modal/", InsightsInfoModal.as_view(), name="modal_insights"), # path("ui/insights/modal/", InsightsInfoModal.as_view(), name="modal_insights"),
## ##
path( path(
"manage/threshold/irc/overview/", "manage/threshold/irc/overview/",

234
core/db/__init__.py Normal file
View File

@ -0,0 +1,234 @@
import random
import string
import time
from math import floor, log10
import orjson
from django.conf import settings
from siphashc import siphash
from core import r
from core.db.processing import annotate_results
from core.util import logs
class StorageBackend(object):
def __init__(self, name):
self.log = logs.get_logger(name)
self.log.info(f"Initialising storage backend {name}")
self.initialise_caching()
self.initialise()
def initialise(self, **kwargs):
raise NotImplementedError
def initialise_caching(self):
hash_key = r.get("cache_hash_key")
if not hash_key:
letters = string.ascii_lowercase
hash_key = "".join(random.choice(letters) for i in range(16))
self.log.debug(f"Created new hash key: {hash_key}")
r.set("cache_hash_key", hash_key)
else:
hash_key = hash_key.decode("ascii")
self.log.debug(f"Decoded hash key: {hash_key}")
self.hash_key = hash_key
def construct_query(self, **kwargs):
raise NotImplementedError
def run_query(self, **kwargs):
raise NotImplementedError
def parse_size(self, query_params, sizes):
if "size" in query_params:
size = query_params["size"]
if size not in sizes:
message = "Size is not permitted"
message_class = "danger"
return {"message": message, "class": message_class}
size = int(size)
else:
size = 15
return size
def parse_index(self, user, query_params):
if "index" in query_params:
index = query_params["index"]
if index == "main":
index = settings.INDEX_MAIN
else:
if not user.has_perm(f"core.index_{index}"):
message = "Not permitted to search by this index"
message_class = "danger"
return {
"message": message,
"class": message_class,
}
if index == "meta":
index = settings.INDEX_META
elif index == "internal":
index = settings.INDEX_INT
else:
message = "Index is not valid."
message_class = "danger"
return {
"message": message,
"class": message_class,
}
else:
index = settings.INDEX_MAIN
return index
def parse_query(self, query_params, tags, size, index, custom_query, add_bool):
if "query" in query_params:
query = query_params["query"]
search_query = self.construct_query(query, size, index)
query_created = True
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
if not query_created:
search_query = self.construct_query(None, size, index, blank=True)
query_created = True
for tagname, tagvalue in tags.items():
add_bool.append({tagname: tagvalue})
required_any = ["query", "tags"]
if not any([field in query_params.keys() for field in required_any]):
if not custom_query:
message = "Empty query!"
message_class = "warning"
return {"message": message, "class": message_class}
return search_query
def parse_source(self, user, query_params):
if "source" in query_params:
source = query_params["source"]
if source in settings.SOURCES_RESTRICTED:
if not user.has_perm("core.restricted_sources"):
message = "Access denied"
message_class = "danger"
return {"message": message, "class": message_class}
elif source not in settings.MAIN_SOURCES:
message = "Invalid source"
message_class = "danger"
return {"message": message, "class": message_class}
if source == "all":
source = None # the next block will populate it
if source:
sources = [source]
else:
sources = list(settings.MAIN_SOURCES)
if user.has_perm("core.restricted_sources"):
for source_iter in settings.SOURCES_RESTRICTED:
sources.append(source_iter)
return sources
def filter_blacklisted(self, user, response):
"""
Low level filter to take the raw OpenSearch response and remove
objects from it we want to keep secret.
Does not return, the object is mutated in place.
"""
response["redacted"] = 0
response["exemption"] = None
if user.is_superuser:
response["exemption"] = True
# is_anonymous = isinstance(user, AnonymousUser)
# For every hit from ES
for index, item in enumerate(list(response["hits"]["hits"])):
# For every blacklisted type
for blacklisted_type in settings.OPENSEARCH_BLACKLISTED.keys():
# Check this field we are matching exists
if "_source" in item.keys():
data_index = "_source"
elif "fields" in item.keys():
data_index = "fields"
else:
return False
if blacklisted_type in item[data_index].keys():
content = item[data_index][blacklisted_type]
# For every item in the blacklisted array for the type
for blacklisted_item in settings.OPENSEARCH_BLACKLISTED[
blacklisted_type
]:
if blacklisted_item == str(content):
# Remove the item
if item in response["hits"]["hits"]:
# Let the UI know something was redacted
if (
"exemption"
not in response["hits"]["hits"][index][data_index]
):
response["redacted"] += 1
# Anonymous
if user.is_anonymous:
# Just set it to none so the index is not off
response["hits"]["hits"][index] = None
else:
if not user.has_perm("core.bypass_blacklist"):
response["hits"]["hits"][index] = None
else:
response["hits"]["hits"][index][data_index][
"exemption"
] = True
# Actually get rid of all the things we set to None
response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]
def query(self, user, search_query):
# For time tracking
start = time.process_time()
if settings.CACHE:
# Sort the keys so the hash is the same
query_normalised = orjson.dumps(search_query, option=orjson.OPT_SORT_KEYS)
hash = siphash(self.hash_key, query_normalised)
cache_hit = r.get(f"query_cache.{user.id}.{hash}")
if cache_hit:
response = orjson.loads(cache_hit)
response["cache"] = True
return response
response = self.run_query(user, search_query)
if "error" in response and len(response.keys()) == 1:
return response
# response = response.to_dict()
# print("RESP", response)
if "took" in response:
if response["took"] is None:
return None
self.filter_blacklisted(user, response)
# Write cache
if settings.CACHE:
to_write_cache = orjson.dumps(response)
r.set(f"query_cache.{user.id}.{hash}", to_write_cache)
r.expire(f"query_cache.{user.id}.{hash}", settings.CACHE_TIMEOUT)
# Parse the response
response_parsed = self.parse(response)
time_took = (time.process_time() - start) * 1000
# Round to 3 significant figures
time_took_rounded = round(time_took, 3 - int(floor(log10(abs(time_took)))) - 1)
return {"object_list": response_parsed, "took": time_took_rounded}
def query_results(self, **kwargs):
raise NotImplementedError
def process_results(self, **kwargs):
if kwargs.get("annotate"):
annotate_results(kwargs["results"])
def parse(self, response):
raise NotImplementedError

153
core/db/druid.py Normal file
View File

@ -0,0 +1,153 @@
import logging
import random
import string
import time
from datetime import datetime
from math import floor, log10
from pprint import pprint
import orjson
import requests
from django.conf import settings
from siphashc import siphash
from core import r
from core.db import StorageBackend
from core.db.processing import parse_druid
from core.views import helpers
logger = logging.getLogger(__name__)
class DruidBackend(StorageBackend):
def __init__(self):
super().__init__("Druid")
def initialise(self, **kwargs):
# self.client = PyDruid("http://broker:8082", "druid/v2")
pass # we use requests
def construct_query(self, query, size, index, blank=False):
search_query = {
"limit": size,
"queryType": "scan",
"dataSource": index,
"filter": {
"type": "and",
"fields": [
],
},
# "resultFormat": "list",
# "columns":[],
"intervals": ["1000-01-01/3000-01-01"],
# "batchSize": 20480,
}
to_add = {
"type": "search",
"dimension": "msg",
"query": {
"type": "insensitive_contains",
"value": query,
},
},
if blank:
return search_query
else:
search_query["filter"]["fields"].append(to_add)
return search_query
def parse(self, response):
parsed = parse_druid(response)
print("PARSE LEN", len(parsed))
return parsed
def run_query(self, user, search_query):
response = requests.post("http://broker:8082/druid/v2", json=search_query)
response = orjson.loads(response.text)
print("RESPONSE LEN", len(response))
ss = orjson.dumps(list(response), option=orjson.OPT_INDENT_2)
ss = ss.decode()
print(ss)
return response
def filter_blacklisted(self, user, response):
pass
def query_results(
self,
request,
query_params,
size=None,
annotate=True,
custom_query=False,
reverse=False,
dedup=False,
dedup_fields=None,
tags=None,
):
add_bool = []
add_top = []
helpers.add_defaults(query_params)
# Check size
if request.user.is_anonymous:
sizes = settings.MAIN_SIZES_ANON
else:
sizes = settings.MAIN_SIZES
if not size:
size = self.parse_size(query_params, sizes)
if isinstance(size, dict):
return size
# Check index
index = self.parse_index(request.user, query_params)
if isinstance(index, dict):
return index
# Create the search query
search_query = self.parse_query(query_params, tags, size, index, custom_query, add_bool)
if isinstance(search_query, dict):
return search_query
sources = self.parse_source(request.user, query_params)
# TODO
add_top_tmp = {"bool": {"should": []}}
total_count = 0
for source_iter in sources:
add_top_tmp["bool"]["should"].append({"equals": {"src": source_iter}})
total_count += 1
total_sources = len(settings.MAIN_SOURCES) + len(
settings.SOURCES_RESTRICTED
)
if not total_count == total_sources:
add_top.append(add_top_tmp)
print("SIZE IS", size)
if add_bool:
self.add_bool(search_query, add_bool)
response = self.query(request.user, search_query)
# print("RESP", response)
# ss = orjson.dumps(list(response), option=orjson.OPT_INDENT_2)
# ss = ss.decode()
# print(ss)
# print("PARSED", results_parsed)
# return results_parsed
context = response
return context
def add_bool(self, search_query, add_bool):
if "filter" in search_query:
if "fields" in search_query["filter"]:
search_query["filter"]["fields"].append({"bool": {"should": add_bool}})
else:
search_query["filter"]["fields"] = [{"bool": {"should": add_bool}}]
else:
search_query["filter"] = {"bool": {"should": add_bool}}

311
core/db/manticore.py Normal file
View File

@ -0,0 +1,311 @@
import logging
import random
import string
import time
from datetime import datetime
from math import floor, log10
from pprint import pprint
import orjson
import requests
from django.conf import settings
from core import r
from core.db import StorageBackend
from core.db.processing import annotate_results, filter_blacklisted, parse_results
from core.views import helpers
logger = logging.getLogger(__name__)
class ManticoreBackend(StorageBackend):
def __init__(self):
super().__init__("Manticore")
def initialise(self, **kwargs):
"""
Initialise the Manticore client
"""
pass # we use requests
def construct_query(self, query, size, index, blank=False):
"""
Accept some query parameters and construct an OpenSearch query.
"""
if not size:
size = 5
query_base = {
"index": index,
"limit": size,
"query": {"bool": {"must": []}},
}
query_string = {
"query_string": query,
}
if not blank:
query_base["query"]["bool"]["must"].append(query_string)
return query_base
def run_query(self, client, user, search_query):
response = requests.post(
f"{settings.MANTICORE_URL}/json/search", json=search_query
)
return response
def query_results(
self,
request,
query_params,
size=None,
annotate=True,
custom_query=False,
reverse=False,
dedup=False,
dedup_fields=None,
tags=None,
):
query = None
message = None
message_class = None
add_bool = []
add_top = []
add_top_negative = []
sort = None
query_created = False
source = None
helpers.add_defaults(query_params)
# Check size
if request.user.is_anonymous:
sizes = settings.MANTICORE_MAIN_SIZES_ANON
else:
sizes = settings.MANTICORE_MAIN_SIZES
if not size:
if "size" in query_params:
size = query_params["size"]
if size not in sizes:
message = "Size is not permitted"
message_class = "danger"
return {"message": message, "class": message_class}
size = int(size)
else:
size = 20
# Check index
if "index" in query_params:
index = query_params["index"]
if index == "main":
index = settings.MANTICORE_INDEX_MAIN
else:
if not request.user.has_perm(f"core.index_{index}"):
message = "Not permitted to search by this index"
message_class = "danger"
return {
"message": message,
"class": message_class,
}
if index == "meta":
index = settings.MANTICORE_INDEX_META
elif index == "internal":
index = settings.MANTICORE_INDEX_INT
else:
message = "Index is not valid."
message_class = "danger"
return {
"message": message,
"class": message_class,
}
else:
index = settings.MANTICORE_INDEX_MAIN
# Create the search query
if "query" in query_params:
query = query_params["query"]
search_query = construct_query(query, size, index)
query_created = True
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
if not query_created:
search_query = construct_query(None, size, index, blank=True)
query_created = True
for tagname, tagvalue in tags.items():
add_bool.append({tagname: tagvalue})
required_any = ["query_full", "query", "tags"]
if not any([field in query_params.keys() for field in required_any]):
if not custom_query:
message = "Empty query!"
message_class = "warning"
return {"message": message, "class": message_class}
# Check for a source
if "source" in query_params:
source = query_params["source"]
if source in settings.SOURCES_RESTRICTED:
if not request.user.has_perm("core.restricted_sources"):
message = "Access denied"
message_class = "danger"
return {"message": message, "class": message_class}
elif source not in settings.MAIN_SOURCES:
message = "Invalid source"
message_class = "danger"
return {"message": message, "class": message_class}
if source == "all":
source = None # the next block will populate it
if source:
sources = [source]
else:
sources = list(settings.MAIN_SOURCES)
if request.user.has_perm("core.restricted_sources"):
for source_iter in settings.SOURCES_RESTRICTED:
sources.append(source_iter)
add_top_tmp = {"bool": {"should": []}}
total_count = 0
for source_iter in sources:
add_top_tmp["bool"]["should"].append({"equals": {"src": source_iter}})
total_count += 1
total_sources = len(settings.MAIN_SOURCES) + len(
settings.SOURCES_RESTRICTED
)
if not total_count == total_sources:
add_top.append(add_top_tmp)
# Date/time range
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys()
):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
from_ts = int(from_ts.timestamp())
to_ts = int(to_ts.timestamp())
range_query = {
"range": {
"ts": {
"gt": from_ts,
"lt": to_ts,
}
}
}
add_top.append(range_query)
# Sorting
if "sorting" in query_params:
sorting = query_params["sorting"]
if sorting not in ("asc", "desc", "none"):
message = "Invalid sort"
message_class = "danger"
return {"message": message, "class": message_class}
if sorting in ("asc", "desc"):
sort = [
{
"ts": {
"order": sorting,
}
}
]
# Sentiment handling
if "check_sentiment" in query_params:
if "sentiment_method" not in query_params:
message = "No sentiment method"
message_class = "danger"
return {"message": message, "class": message_class}
if "sentiment" in query_params:
sentiment = query_params["sentiment"]
try:
sentiment = float(sentiment)
except ValueError:
message = "Sentiment is not a float"
message_class = "danger"
return {"message": message, "class": message_class}
sentiment_method = query_params["sentiment_method"]
range_query_compare = {"range": {"sentiment": {}}}
range_query_precise = {
"match": {
"sentiment": None,
}
}
if sentiment_method == "below":
range_query_compare["range"]["sentiment"]["lt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "above":
range_query_compare["range"]["sentiment"]["gt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "exact":
range_query_precise["match"]["sentiment"] = sentiment
add_top.append(range_query_precise)
elif sentiment_method == "nonzero":
range_query_precise["match"]["sentiment"] = 0
add_top_negative.append(range_query_precise)
if add_bool:
# if "bool" not in search_query["query"]:
# search_query["query"]["bool"] = {}
# if "must" not in search_query["query"]["bool"]:
# search_query["query"]["bool"] = {"must": []}
for item in add_bool:
search_query["query"]["bool"]["must"].append({"match": item})
if add_top:
for item in add_top:
search_query["query"]["bool"]["must"].append(item)
if add_top_negative:
for item in add_top_negative:
if "must_not" in search_query["query"]["bool"]:
search_query["query"]["bool"]["must_not"].append(item)
else:
search_query["query"]["bool"]["must_not"] = [item]
if sort:
search_query["sort"] = sort
pprint(search_query)
results = run_query(
client,
request.user, # passed through run_main_query to filter_blacklisted
search_query,
)
if not results:
message = "Error running query"
message_class = "danger"
return {"message": message, "class": message_class}
# results = results.to_dict()
if "error" in results:
message = results["error"]
message_class = "danger"
return {"message": message, "class": message_class}
results_parsed = parse_results(results)
if annotate:
annotate_results(results_parsed)
if "dedup" in query_params:
if query_params["dedup"] == "on":
dedup = True
else:
dedup = False
else:
dedup = False
if reverse:
results_parsed = results_parsed[::-1]
if dedup:
if not dedup_fields:
dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
results_parsed = helpers.dedup_list(results_parsed, dedup_fields)
context = {
"object_list": results_parsed,
"card": results["hits"]["total"],
"took": results["took"],
}
if "cache" in results:
context["cache"] = results["cache"]
return context

485
core/db/opensearch.py Normal file
View File

@ -0,0 +1,485 @@
# from copy import deepcopy
# from datetime import datetime, timedelta
from django.conf import settings
from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError
from core.db import StorageBackend
# from json import dumps
# pp = lambda x: print(dumps(x, indent=2))
from core.db.processing import annotate_results, filter_blacklisted, parse_results
from core.views.helpers import dedup_list
class OpensearchBackend(StorageBackend):
def __init__(self):
super().__init__("Opensearch")
def initialise(self, **kwargs):
"""
Inititialise the OpenSearch API endpoint.
"""
auth = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)
client = OpenSearch(
# fmt: off
hosts=[{"host": settings.OPENSEARCH_URL,
"port": settings.OPENSEARCH_PORT}],
http_compress=False, # enables gzip compression for request bodies
http_auth=auth,
# client_cert = client_cert_path,
# client_key = client_key_path,
use_ssl=settings.OPENSEARCH_TLS,
verify_certs=False,
ssl_assert_hostname=False,
ssl_show_warn=False,
# a_certs=ca_certs_path,
)
self.client = client
def construct_query(self, query, size, use_query_string=True, tokens=False):
"""
Accept some query parameters and construct an OpenSearch query.
"""
if not size:
size = 5
query_base = {
"size": size,
"query": {"bool": {"must": []}},
}
query_string = {
"query_string": {
"query": query,
# "fields": fields,
# "default_field": "msg",
# "type": "best_fields",
"fuzziness": "AUTO",
"fuzzy_transpositions": True,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
# "minimum_should_match": 1,
"default_operator": "or",
"analyzer": "standard",
"lenient": True,
"boost": 1,
"allow_leading_wildcard": True,
# "enable_position_increments": False,
"phrase_slop": 3,
# "max_determinized_states": 10000,
"quote_field_suffix": "",
"quote_analyzer": "standard",
"analyze_wildcard": False,
"auto_generate_synonyms_phrase_query": True,
}
}
query_tokens = {
"simple_query_string": {
# "tokens": query,
"query": query,
"fields": ["tokens"],
"flags": "ALL",
"fuzzy_transpositions": True,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
"default_operator": "and",
"analyzer": "standard",
"lenient": True,
"boost": 1,
"quote_field_suffix": "",
"analyze_wildcard": False,
"auto_generate_synonyms_phrase_query": False,
}
}
if tokens:
query_base["query"]["bool"]["must"].append(query_tokens)
# query["query"]["bool"]["must"].append(query_string)
# query["query"]["bool"]["must"][0]["query_string"]["fields"] = ["tokens"]
elif use_query_string:
query_base["query"]["bool"]["must"].append(query_string)
return query_base
def run_query(self, client, user, query, custom_query=False, index=None, size=None):
"""
Low level helper to run an ES query.
Accept a user to pass it to the filter, so we can
avoid filtering for superusers.
Accept fields and size, for the fields we want to match and the
number of results to return.
"""
if not index:
index = settings.INDEX_MAIN
if custom_query:
search_query = query
else:
search_query = self.construct_query(query, size)
try:
response = client.search(body=search_query, index=index)
except RequestError as err:
print("OpenSearch error", err)
return err
except NotFoundError as err:
print("OpenSearch error", err)
return err
return response
def query_results(
self,
request,
query_params,
size=None,
annotate=True,
custom_query=False,
reverse=False,
dedup=False,
dedup_fields=None,
lookup_hashes=True,
tags=None,
):
"""
API helper to alter the OpenSearch return format into something
a bit better to parse.
Accept a HTTP request object. Run the query, and annotate the
results with the other data we have.
"""
# is_anonymous = isinstance(request.user, AnonymousUser)
query = None
message = None
message_class = None
add_bool = []
add_top = []
add_top_negative = []
sort = None
query_created = False
# Lookup the hash values but don't disclose them to the user
# denied = []
# if lookup_hashes:
# if settings.HASHING:
# query_params = deepcopy(query_params)
# denied_q = hash_lookup(request.user, query_params)
# denied.extend(denied_q)
# if tags:
# denied_t = hash_lookup(request.user, tags, query_params)
# denied.extend(denied_t)
# message = "Permission denied: "
# for x in denied:
# if isinstance(x, SearchDenied):
# message += f"Search({x.key}: {x.value}) "
# elif isinstance(x, LookupDenied):
# message += f"Lookup({x.key}: {x.value}) "
# if denied:
# # message = [f"{i}" for i in message]
# # message = "\n".join(message)
# message_class = "danger"
# return {"message": message, "class": message_class}
if request.user.is_anonymous:
sizes = settings.MAIN_SIZES_ANON
else:
sizes = settings.MAIN_SIZES
if not size:
if "size" in query_params:
size = query_params["size"]
if size not in sizes:
message = "Size is not permitted"
message_class = "danger"
return {"message": message, "class": message_class}
else:
size = 20
source = None
if "source" in query_params:
source = query_params["source"]
if source in settings.SOURCES_RESTRICTED:
if not request.user.has_perm("core.restricted_sources"):
message = "Access denied"
message_class = "danger"
return {"message": message, "class": message_class}
elif source not in settings.MAIN_SOURCES:
message = "Invalid source"
message_class = "danger"
return {"message": message, "class": message_class}
if source == "all":
source = None # the next block will populate it
if source:
sources = [source]
else:
sources = settings.MAIN_SOURCES
if request.user.has_perm("core.restricted_sources"):
for source_iter in settings.SOURCES_RESTRICTED:
sources.append(source_iter)
add_top_tmp = {"bool": {"should": []}}
for source_iter in sources:
add_top_tmp["bool"]["should"].append({"match_phrase": {"src": source_iter}})
add_top.append(add_top_tmp)
# date_query = False
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys()
):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
range_query = {
"range": {
"ts": {
"gt": from_ts,
"lt": to_ts,
}
}
}
add_top.append(range_query)
# if date_query:
# if settings.DELAY_RESULTS:
# if source not in settings.SAFE_SOURCES:
# if request.user.has_perm("core.bypass_delay"):
# add_top.append(range_query)
# else:
# delay_as_ts = datetime.now() - timedelta(
# days=settings.DELAY_DURATION
# )
# lt_as_ts = datetime.strptime(
# range_query["range"]["ts"]["lt"], "%Y-%m-%dT%H:%MZ"
# )
# if lt_as_ts > delay_as_ts:
# range_query["range"]["ts"][
# "lt"
# ] = f"now-{settings.DELAY_DURATION}d"
# add_top.append(range_query)
# else:
# add_top.append(range_query)
# else:
# if settings.DELAY_RESULTS:
# if source not in settings.SAFE_SOURCES:
# if not request.user.has_perm("core.bypass_delay"):
# range_query = {
# "range": {
# "ts": {
# # "gt": ,
# "lt": f"now-{settings.DELAY_DURATION}d",
# }
# }
# }
# add_top.append(range_query)
if "sorting" in query_params:
sorting = query_params["sorting"]
if sorting not in ("asc", "desc", "none"):
message = "Invalid sort"
message_class = "danger"
return {"message": message, "class": message_class}
if sorting in ("asc", "desc"):
sort = [
{
"ts": {
"order": sorting,
}
}
]
if "check_sentiment" in query_params:
if "sentiment_method" not in query_params:
message = "No sentiment method"
message_class = "danger"
return {"message": message, "class": message_class}
if "sentiment" in query_params:
sentiment = query_params["sentiment"]
try:
sentiment = float(sentiment)
except ValueError:
message = "Sentiment is not a float"
message_class = "danger"
return {"message": message, "class": message_class}
sentiment_method = query_params["sentiment_method"]
range_query_compare = {"range": {"sentiment": {}}}
range_query_precise = {
"match": {
"sentiment": None,
}
}
if sentiment_method == "below":
range_query_compare["range"]["sentiment"]["lt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "above":
range_query_compare["range"]["sentiment"]["gt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "exact":
range_query_precise["match"]["sentiment"] = sentiment
add_top.append(range_query_precise)
elif sentiment_method == "nonzero":
range_query_precise["match"]["sentiment"] = 0
add_top_negative.append(range_query_precise)
# Only one of query or query_full can be active at once
# We prefer query because it's simpler
if "query" in query_params:
query = query_params["query"]
search_query = self.construct_query(query, size, tokens=True)
query_created = True
elif "query_full" in query_params:
query_full = query_params["query_full"]
# if request.user.has_perm("core.query_search"):
search_query = self.construct_query(query_full, size)
query_created = True
# else:
# message = "You cannot search by query string"
# message_class = "danger"
# return {"message": message, "class": message_class}
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
if not query_created:
search_query = self.construct_query(None, size, use_query_string=False)
query_created = True
for tagname, tagvalue in tags.items():
add_bool.append({tagname: tagvalue})
required_any = ["query_full", "query", "tags"]
if not any([field in query_params.keys() for field in required_any]):
if not custom_query:
message = "Empty query!"
message_class = "warning"
return {"message": message, "class": message_class}
if add_bool:
# if "bool" not in search_query["query"]:
# search_query["query"]["bool"] = {}
# if "must" not in search_query["query"]["bool"]:
# search_query["query"]["bool"] = {"must": []}
for item in add_bool:
search_query["query"]["bool"]["must"].append({"match_phrase": item})
if add_top:
for item in add_top:
search_query["query"]["bool"]["must"].append(item)
if add_top_negative:
for item in add_top_negative:
if "must_not" in search_query["query"]["bool"]:
search_query["query"]["bool"]["must_not"].append(item)
else:
search_query["query"]["bool"]["must_not"] = [item]
if sort:
search_query["sort"] = sort
if "index" in query_params:
index = query_params["index"]
if index == "main":
index = settings.INDEX_MAIN
else:
if not request.user.has_perm(f"core.index_{index}"):
message = "Not permitted to search by this index"
message_class = "danger"
return {
"message": message,
"class": message_class,
}
if index == "meta":
index = settings.INDEX_META
elif index == "internal":
index = settings.INDEX_INT
else:
message = "Index is not valid."
message_class = "danger"
return {
"message": message,
"class": message_class,
}
else:
index = settings.INDEX_MAIN
results = self.query(
request.user, # passed through run_main_query to filter_blacklisted
search_query,
custom_query=True,
index=index,
size=size,
)
if not results:
return False
if isinstance(results, Exception):
message = f"Error: {results.info['error']['root_cause'][0]['type']}"
message_class = "danger"
return {"message": message, "class": message_class}
if len(results["hits"]["hits"]) == 0:
message = "No results."
message_class = "danger"
return {"message": message, "class": message_class}
results_parsed = parse_results(results)
if annotate:
annotate_results(results_parsed)
if "dedup" in query_params:
if query_params["dedup"] == "on":
dedup = True
else:
dedup = False
else:
dedup = False
if reverse:
results_parsed = results_parsed[::-1]
if dedup:
if not dedup_fields:
dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
results_parsed = dedup_list(results_parsed, dedup_fields)
# if source not in settings.SAFE_SOURCES:
# if settings.ENCRYPTION:
# encrypt_list(request.user, results_parsed, settings.ENCRYPTION_KEY)
# if settings.HASHING:
# hash_list(request.user, results_parsed)
# if settings.OBFUSCATION:
# obfuscate_list(request.user, results_parsed)
# if settings.RANDOMISATION:
# randomise_list(request.user, results_parsed)
# process_list(results)
# IMPORTANT! - DO NOT PASS query_params to the user!
context = {
"object_list": results_parsed,
"card": results["hits"]["total"]["value"],
"took": results["took"],
}
if "redacted" in results:
context["redacted"] = results["redacted"]
if "exemption" in results:
context["exemption"] = results["exemption"]
if query:
context["query"] = query
# if settings.DELAY_RESULTS:
# if source not in settings.SAFE_SOURCES:
# if not request.user.has_perm("core.bypass_delay"):
# context["delay"] = settings.DELAY_DURATION
# if settings.RANDOMISATION:
# if source not in settings.SAFE_SOURCES:
# if not request.user.has_perm("core.bypass_randomisation"):
# context["randomised"] = True
return context
def query_single_result(self, request, query_params):
context = self.query_results(request, query_params, size=100)
if not context:
return {"message": "Failed to run query", "message_class": "danger"}
if "message" in context:
return context
dedup_set = {item["nick"] for item in context["object_list"]}
if dedup_set:
context["item"] = context["object_list"][0]
return context

View File

@ -60,59 +60,6 @@ def annotate_results(results_parsed):
item["num_chans"] = num_chans[item["nick"]] item["num_chans"] = num_chans[item["nick"]]
def filter_blacklisted(user, response):
"""
Low level filter to take the raw OpenSearch response and remove
objects from it we want to keep secret.
Does not return, the object is mutated in place.
"""
response["redacted"] = 0
response["exemption"] = None
if user.is_superuser:
response["exemption"] = True
# is_anonymous = isinstance(user, AnonymousUser)
# For every hit from ES
for index, item in enumerate(list(response["hits"]["hits"])):
# For every blacklisted type
for blacklisted_type in settings.OPENSEARCH_BLACKLISTED.keys():
# Check this field we are matching exists
if "_source" in item.keys():
data_index = "_source"
elif "fields" in item.keys():
data_index = "fields"
else:
return False
if blacklisted_type in item[data_index].keys():
content = item[data_index][blacklisted_type]
# For every item in the blacklisted array for the type
for blacklisted_item in settings.OPENSEARCH_BLACKLISTED[
blacklisted_type
]:
if blacklisted_item == str(content):
# Remove the item
if item in response["hits"]["hits"]:
# Let the UI know something was redacted
if (
"exemption"
not in response["hits"]["hits"][index][data_index]
):
response["redacted"] += 1
# Anonymous
if user.is_anonymous:
# Just set it to none so the index is not off
response["hits"]["hits"][index] = None
else:
if not user.has_perm("core.bypass_blacklist"):
response["hits"]["hits"][index] = None
else:
response["hits"]["hits"][index][data_index][
"exemption"
] = True
# Actually get rid of all the things we set to None
response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]
def parse_results(results): def parse_results(results):
results_parsed = [] results_parsed = []
stringify = ["host", "channel"] stringify = ["host", "channel"]
@ -166,3 +113,14 @@ def parse_results(results):
element["time"] = time element["time"] = time
results_parsed.append(element) results_parsed.append(element)
return results_parsed return results_parsed
def parse_druid(response):
results_parsed = []
for item in response:
if "events" in item:
for event in item["events"]:
results_parsed.append(event)
else:
raise Exception(f"events not in item {item}")
return results_parsed

21
core/db/storage.py Normal file
View File

@ -0,0 +1,21 @@
from django.conf import settings
def get_db():
if settings.DB_BACKEND == "DRUID":
from core.db.druid import DruidBackend
return DruidBackend()
elif settings.DB_BACKEND == "OPENSEARCH":
from core.db.opensearch import OpensearchBackend
return OpensearchBackend()
elif settings.DB_BACKEND == "MANTICORE":
from core.db.manticore import ManticoreBackend
return ManticoreBackend()
else:
raise Exception("Invalid DB backend")
db = get_db()

View File

@ -1,6 +1,5 @@
from django.conf import settings from django.conf import settings
from core.lib.opensearch import client, run_main_query
from core.lib.threshold import threshold_request from core.lib.threshold import threshold_request
@ -162,35 +161,6 @@ def construct_alert_query():
return query return query
def get_irc_alerts(user):
query = construct_alert_query()
results = run_main_query(
client,
user, # passed through run_main_query to filter_blacklisted
query,
custom_query=True,
index=settings.OPENSEARCH_INDEX_INT,
)
if not results:
return []
results_parsed = []
if "hits" in results.keys():
if "hits" in results["hits"]:
for item in results["hits"]["hits"]:
element = item["_source"]
element["id"] = item["_id"]
# Split the timestamp into date and time
ts = element["ts"]
ts_spl = ts.split("T")
date = ts_spl[0]
time = ts_spl[1]
element["date"] = date
element["time"] = time
results_parsed.append(element)
return results_parsed
def send_irc_message(net, num, channel, msg, nick=None): def send_irc_message(net, num, channel, msg, nick=None):
url = f"irc/msg/{net}/{num}" url = f"irc/msg/{net}/{num}"
payload = {"msg": msg, "channel": channel} payload = {"msg": msg, "channel": channel}

View File

@ -1,362 +0,0 @@
import logging
import random
import string
import time
from datetime import datetime
from math import floor, log10
from pprint import pprint
import manticoresearch
import requests
import ujson
from django.conf import settings
from siphashc import siphash
from core import r
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
from core.views import helpers
logger = logging.getLogger(__name__)
def initialise_manticore():
"""
Initialise the Manticore client
"""
configuration = manticoresearch.Configuration(host=settings.MANTICORE_URL)
api_client = manticoresearch.ApiClient(configuration)
api_instance = manticoresearch.SearchApi(api_client)
return (api_client, api_instance)
api_client, client = initialise_manticore()
def initialise_caching():
hash_key = r.get("cache_hash_key")
if not hash_key:
letters = string.ascii_lowercase
hash_key = "".join(random.choice(letters) for i in range(16))
logger.debug(f"Created new hash key: {hash_key}")
r.set("cache_hash_key", hash_key)
else:
hash_key = hash_key.decode("ascii")
logger.debug(f"Decoded hash key: {hash_key}")
return hash_key
hash_key = initialise_caching()
def construct_query(query, size, index, blank=False):
"""
Accept some query parameters and construct an OpenSearch query.
"""
if not size:
size = 5
query_base = {
"index": index,
"limit": size,
"query": {"bool": {"must": []}},
}
query_string = {
"query_string": query,
}
if not blank:
query_base["query"]["bool"]["must"].append(query_string)
return query_base
def run_query(client, user, search_query):
if settings.MANTICORE_CACHE:
start = time.process_time()
query_normalised = ujson.dumps(search_query, sort_keys=True)
hash = siphash(hash_key, query_normalised)
cache_hit = r.get(f"query_cache.{user.id}.{hash}")
if cache_hit:
response = ujson.loads(cache_hit)
time_took = (time.process_time() - start) * 1000
# Round to 3 significant figures
time_took_rounded = round(
time_took, 3 - int(floor(log10(abs(time_took)))) - 1
)
response["took"] = time_took_rounded
response["cache"] = True
return response
# response = client.search(search_query)
response = requests.post(f"{settings.MANTICORE_URL}/json/search", json=search_query)
response = ujson.loads(response.text)
if "error" in response and len(response.keys()) == 1:
return response
# response = response.to_dict()
#print("RESP", response)
if "took" in response:
if response["took"] is None:
return None
filter_blacklisted(user, response)
# Write cache
if settings.MANTICORE_CACHE:
to_write_cache = ujson.dumps(response)
r.set(f"query_cache.{user.id}.{hash}", to_write_cache)
r.expire(f"query_cache.{user.id}.{hash}", settings.MANTICORE_CACHE_TIMEOUT)
return response
def query_results(
request,
query_params,
size=None,
annotate=True,
custom_query=False,
reverse=False,
dedup=False,
dedup_fields=None,
tags=None,
):
query = None
message = None
message_class = None
add_bool = []
add_top = []
add_top_negative = []
sort = None
query_created = False
source = None
helpers.add_defaults(query_params)
# Check size
if request.user.is_anonymous:
sizes = settings.MANTICORE_MAIN_SIZES_ANON
else:
sizes = settings.MANTICORE_MAIN_SIZES
if not size:
if "size" in query_params:
size = query_params["size"]
if size not in sizes:
message = "Size is not permitted"
message_class = "danger"
return {"message": message, "class": message_class}
size = int(size)
else:
size = 20
# Check index
if "index" in query_params:
index = query_params["index"]
if index == "main":
index = settings.MANTICORE_INDEX_MAIN
else:
if not request.user.has_perm(f"core.index_{index}"):
message = "Not permitted to search by this index"
message_class = "danger"
return {
"message": message,
"class": message_class,
}
if index == "meta":
index = settings.MANTICORE_INDEX_META
elif index == "internal":
index = settings.MANTICORE_INDEX_INT
else:
message = "Index is not valid."
message_class = "danger"
return {
"message": message,
"class": message_class,
}
else:
index = settings.MANTICORE_INDEX_MAIN
# Create the search query
if "query" in query_params:
query = query_params["query"]
search_query = construct_query(query, size, index)
query_created = True
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
if not query_created:
search_query = construct_query(None, size, index, blank=True)
query_created = True
for tagname, tagvalue in tags.items():
add_bool.append({tagname: tagvalue})
required_any = ["query_full", "query", "tags"]
if not any([field in query_params.keys() for field in required_any]):
if not custom_query:
message = "Empty query!"
message_class = "warning"
return {"message": message, "class": message_class}
# Check for a source
if "source" in query_params:
source = query_params["source"]
if source in settings.MANTICORE_SOURCES_RESTRICTED:
if not request.user.has_perm("core.restricted_sources"):
message = "Access denied"
message_class = "danger"
return {"message": message, "class": message_class}
elif source not in settings.MANTICORE_MAIN_SOURCES:
message = "Invalid source"
message_class = "danger"
return {"message": message, "class": message_class}
if source == "all":
source = None # the next block will populate it
if source:
sources = [source]
else:
sources = list(settings.MANTICORE_MAIN_SOURCES)
if request.user.has_perm("core.restricted_sources"):
for source_iter in settings.MANTICORE_SOURCES_RESTRICTED:
sources.append(source_iter)
add_top_tmp = {"bool": {"should": []}}
total_count = 0
for source_iter in sources:
add_top_tmp["bool"]["should"].append({"equals": {"src": source_iter}})
total_count += 1
total_sources = len(settings.MANTICORE_MAIN_SOURCES) + len(
settings.MANTICORE_SOURCES_RESTRICTED
)
if not total_count == total_sources:
add_top.append(add_top_tmp)
# Date/time range
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys()
):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
from_ts = datetime.strptime(from_ts, "%Y-%m-%dT%H:%MZ")
to_ts = datetime.strptime(to_ts, "%Y-%m-%dT%H:%MZ")
from_ts = int(from_ts.timestamp())
to_ts = int(to_ts.timestamp())
range_query = {
"range": {
"ts": {
"gt": from_ts,
"lt": to_ts,
}
}
}
add_top.append(range_query)
# Sorting
if "sorting" in query_params:
sorting = query_params["sorting"]
if sorting not in ("asc", "desc", "none"):
message = "Invalid sort"
message_class = "danger"
return {"message": message, "class": message_class}
if sorting in ("asc", "desc"):
sort = [
{
"ts": {
"order": sorting,
}
}
]
# Sentiment handling
if "check_sentiment" in query_params:
if "sentiment_method" not in query_params:
message = "No sentiment method"
message_class = "danger"
return {"message": message, "class": message_class}
if "sentiment" in query_params:
sentiment = query_params["sentiment"]
try:
sentiment = float(sentiment)
except ValueError:
message = "Sentiment is not a float"
message_class = "danger"
return {"message": message, "class": message_class}
sentiment_method = query_params["sentiment_method"]
range_query_compare = {"range": {"sentiment": {}}}
range_query_precise = {
"match": {
"sentiment": None,
}
}
if sentiment_method == "below":
range_query_compare["range"]["sentiment"]["lt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "above":
range_query_compare["range"]["sentiment"]["gt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "exact":
range_query_precise["match"]["sentiment"] = sentiment
add_top.append(range_query_precise)
elif sentiment_method == "nonzero":
range_query_precise["match"]["sentiment"] = 0
add_top_negative.append(range_query_precise)
if add_bool:
# if "bool" not in search_query["query"]:
# search_query["query"]["bool"] = {}
# if "must" not in search_query["query"]["bool"]:
# search_query["query"]["bool"] = {"must": []}
for item in add_bool:
search_query["query"]["bool"]["must"].append({"match": item})
if add_top:
for item in add_top:
search_query["query"]["bool"]["must"].append(item)
if add_top_negative:
for item in add_top_negative:
if "must_not" in search_query["query"]["bool"]:
search_query["query"]["bool"]["must_not"].append(item)
else:
search_query["query"]["bool"]["must_not"] = [item]
if sort:
search_query["sort"] = sort
pprint(search_query)
results = run_query(
client,
request.user, # passed through run_main_query to filter_blacklisted
search_query,
)
if not results:
message = "Error running query"
message_class = "danger"
return {"message": message, "class": message_class}
# results = results.to_dict()
if "error" in results:
message = results["error"]
message_class = "danger"
return {"message": message, "class": message_class}
results_parsed = parse_results(results)
if annotate:
annotate_results(results_parsed)
if "dedup" in query_params:
if query_params["dedup"] == "on":
dedup = True
else:
dedup = False
else:
dedup = False
if reverse:
results_parsed = results_parsed[::-1]
if dedup:
if not dedup_fields:
dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
results_parsed = helpers.dedup_list(results_parsed, dedup_fields)
context = {
"object_list": results_parsed,
"card": results["hits"]["total"],
"took": results["took"],
}
if "cache" in results:
context["cache"] = results["cache"]
return context

View File

@ -3,7 +3,7 @@ from math import ceil
from django.conf import settings from django.conf import settings
from numpy import array_split from numpy import array_split
from core.lib.opensearch import client, run_main_query from core.db.opensearch import client, run_main_query
def construct_query(net, nicks): def construct_query(net, nicks):

View File

@ -3,7 +3,7 @@ from math import ceil
from django.conf import settings from django.conf import settings
from numpy import array_split from numpy import array_split
from core.lib.opensearch import client, run_main_query from core.lib.druid import client, run_main_query
def construct_query(net, nicks): def construct_query(net, nicks):

View File

@ -1,487 +0,0 @@
# from copy import deepcopy
# from datetime import datetime, timedelta
from django.conf import settings
from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError
# from json import dumps
# pp = lambda x: print(dumps(x, indent=2))
from core.lib.processing import annotate_results, filter_blacklisted, parse_results
from core.views.helpers import dedup_list
def initialise_opensearch():
"""
Inititialise the OpenSearch API endpoint.
"""
auth = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)
client = OpenSearch(
# fmt: off
hosts=[{"host": settings.OPENSEARCH_URL,
"port": settings.OPENSEARCH_PORT}],
http_compress=False, # enables gzip compression for request bodies
http_auth=auth,
# client_cert = client_cert_path,
# client_key = client_key_path,
use_ssl=settings.OPENSEARCH_TLS,
verify_certs=False,
ssl_assert_hostname=False,
ssl_show_warn=False,
# a_certs=ca_certs_path,
)
return client
client = initialise_opensearch()
def construct_query(query, size, use_query_string=True, tokens=False):
"""
Accept some query parameters and construct an OpenSearch query.
"""
if not size:
size = 5
query_base = {
"size": size,
"query": {"bool": {"must": []}},
}
query_string = {
"query_string": {
"query": query,
# "fields": fields,
# "default_field": "msg",
# "type": "best_fields",
"fuzziness": "AUTO",
"fuzzy_transpositions": True,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
# "minimum_should_match": 1,
"default_operator": "or",
"analyzer": "standard",
"lenient": True,
"boost": 1,
"allow_leading_wildcard": True,
# "enable_position_increments": False,
"phrase_slop": 3,
# "max_determinized_states": 10000,
"quote_field_suffix": "",
"quote_analyzer": "standard",
"analyze_wildcard": False,
"auto_generate_synonyms_phrase_query": True,
}
}
query_tokens = {
"simple_query_string": {
# "tokens": query,
"query": query,
"fields": ["tokens"],
"flags": "ALL",
"fuzzy_transpositions": True,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
"default_operator": "and",
"analyzer": "standard",
"lenient": True,
"boost": 1,
"quote_field_suffix": "",
"analyze_wildcard": False,
"auto_generate_synonyms_phrase_query": False,
}
}
if tokens:
query_base["query"]["bool"]["must"].append(query_tokens)
# query["query"]["bool"]["must"].append(query_string)
# query["query"]["bool"]["must"][0]["query_string"]["fields"] = ["tokens"]
elif use_query_string:
query_base["query"]["bool"]["must"].append(query_string)
return query_base
def run_main_query(client, user, query, custom_query=False, index=None, size=None):
"""
Low level helper to run an ES query.
Accept a user to pass it to the filter, so we can
avoid filtering for superusers.
Accept fields and size, for the fields we want to match and the
number of results to return.
"""
if not index:
index = settings.OPENSEARCH_INDEX_MAIN
if custom_query:
search_query = query
else:
search_query = construct_query(query, size)
try:
response = client.search(body=search_query, index=index)
except RequestError as err:
print("OpenSearch error", err)
return err
except NotFoundError as err:
print("OpenSearch error", err)
return err
filter_blacklisted(user, response)
return response
def query_results(
request,
query_params,
size=None,
annotate=True,
custom_query=False,
reverse=False,
dedup=False,
dedup_fields=None,
lookup_hashes=True,
tags=None,
):
"""
API helper to alter the OpenSearch return format into something
a bit better to parse.
Accept a HTTP request object. Run the query, and annotate the
results with the other data we have.
"""
# is_anonymous = isinstance(request.user, AnonymousUser)
query = None
message = None
message_class = None
add_bool = []
add_top = []
add_top_negative = []
sort = None
query_created = False
# Lookup the hash values but don't disclose them to the user
# denied = []
# if lookup_hashes:
# if settings.HASHING:
# query_params = deepcopy(query_params)
# denied_q = hash_lookup(request.user, query_params)
# denied.extend(denied_q)
# if tags:
# denied_t = hash_lookup(request.user, tags, query_params)
# denied.extend(denied_t)
# message = "Permission denied: "
# for x in denied:
# if isinstance(x, SearchDenied):
# message += f"Search({x.key}: {x.value}) "
# elif isinstance(x, LookupDenied):
# message += f"Lookup({x.key}: {x.value}) "
# if denied:
# # message = [f"{i}" for i in message]
# # message = "\n".join(message)
# message_class = "danger"
# return {"message": message, "class": message_class}
if request.user.is_anonymous:
sizes = settings.OPENSEARCH_MAIN_SIZES_ANON
else:
sizes = settings.OPENSEARCH_MAIN_SIZES
if not size:
if "size" in query_params:
size = query_params["size"]
if size not in sizes:
message = "Size is not permitted"
message_class = "danger"
return {"message": message, "class": message_class}
else:
size = 20
source = None
if "source" in query_params:
source = query_params["source"]
if source in settings.OPENSEARCH_SOURCES_RESTRICTED:
if not request.user.has_perm("core.restricted_sources"):
message = "Access denied"
message_class = "danger"
return {"message": message, "class": message_class}
elif source not in settings.OPENSEARCH_MAIN_SOURCES:
message = "Invalid source"
message_class = "danger"
return {"message": message, "class": message_class}
if source == "all":
source = None # the next block will populate it
if source:
sources = [source]
else:
sources = settings.OPENSEARCH_MAIN_SOURCES
if request.user.has_perm("core.restricted_sources"):
for source_iter in settings.OPENSEARCH_SOURCES_RESTRICTED:
sources.append(source_iter)
add_top_tmp = {"bool": {"should": []}}
for source_iter in sources:
add_top_tmp["bool"]["should"].append({"match_phrase": {"src": source_iter}})
add_top.append(add_top_tmp)
# date_query = False
if set({"from_date", "to_date", "from_time", "to_time"}).issubset(
query_params.keys()
):
from_ts = f"{query_params['from_date']}T{query_params['from_time']}Z"
to_ts = f"{query_params['to_date']}T{query_params['to_time']}Z"
range_query = {
"range": {
"ts": {
"gt": from_ts,
"lt": to_ts,
}
}
}
add_top.append(range_query)
# if date_query:
# if settings.DELAY_RESULTS:
# if source not in settings.SAFE_SOURCES:
# if request.user.has_perm("core.bypass_delay"):
# add_top.append(range_query)
# else:
# delay_as_ts = datetime.now() - timedelta(
# days=settings.DELAY_DURATION
# )
# lt_as_ts = datetime.strptime(
# range_query["range"]["ts"]["lt"], "%Y-%m-%dT%H:%MZ"
# )
# if lt_as_ts > delay_as_ts:
# range_query["range"]["ts"][
# "lt"
# ] = f"now-{settings.DELAY_DURATION}d"
# add_top.append(range_query)
# else:
# add_top.append(range_query)
# else:
# if settings.DELAY_RESULTS:
# if source not in settings.SAFE_SOURCES:
# if not request.user.has_perm("core.bypass_delay"):
# range_query = {
# "range": {
# "ts": {
# # "gt": ,
# "lt": f"now-{settings.DELAY_DURATION}d",
# }
# }
# }
# add_top.append(range_query)
if "sorting" in query_params:
sorting = query_params["sorting"]
if sorting not in ("asc", "desc", "none"):
message = "Invalid sort"
message_class = "danger"
return {"message": message, "class": message_class}
if sorting in ("asc", "desc"):
sort = [
{
"ts": {
"order": sorting,
}
}
]
if "check_sentiment" in query_params:
if "sentiment_method" not in query_params:
message = "No sentiment method"
message_class = "danger"
return {"message": message, "class": message_class}
if "sentiment" in query_params:
sentiment = query_params["sentiment"]
try:
sentiment = float(sentiment)
except ValueError:
message = "Sentiment is not a float"
message_class = "danger"
return {"message": message, "class": message_class}
sentiment_method = query_params["sentiment_method"]
range_query_compare = {"range": {"sentiment": {}}}
range_query_precise = {
"match": {
"sentiment": None,
}
}
if sentiment_method == "below":
range_query_compare["range"]["sentiment"]["lt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "above":
range_query_compare["range"]["sentiment"]["gt"] = sentiment
add_top.append(range_query_compare)
elif sentiment_method == "exact":
range_query_precise["match"]["sentiment"] = sentiment
add_top.append(range_query_precise)
elif sentiment_method == "nonzero":
range_query_precise["match"]["sentiment"] = 0
add_top_negative.append(range_query_precise)
# Only one of query or query_full can be active at once
# We prefer query because it's simpler
if "query" in query_params:
query = query_params["query"]
search_query = construct_query(query, size, tokens=True)
query_created = True
elif "query_full" in query_params:
query_full = query_params["query_full"]
# if request.user.has_perm("core.query_search"):
search_query = construct_query(query_full, size)
query_created = True
# else:
# message = "You cannot search by query string"
# message_class = "danger"
# return {"message": message, "class": message_class}
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
if not query_created:
search_query = construct_query(None, size, use_query_string=False)
query_created = True
for tagname, tagvalue in tags.items():
add_bool.append({tagname: tagvalue})
required_any = ["query_full", "query", "tags"]
if not any([field in query_params.keys() for field in required_any]):
if not custom_query:
message = "Empty query!"
message_class = "warning"
return {"message": message, "class": message_class}
if add_bool:
# if "bool" not in search_query["query"]:
# search_query["query"]["bool"] = {}
# if "must" not in search_query["query"]["bool"]:
# search_query["query"]["bool"] = {"must": []}
for item in add_bool:
search_query["query"]["bool"]["must"].append({"match_phrase": item})
if add_top:
for item in add_top:
search_query["query"]["bool"]["must"].append(item)
if add_top_negative:
for item in add_top_negative:
if "must_not" in search_query["query"]["bool"]:
search_query["query"]["bool"]["must_not"].append(item)
else:
search_query["query"]["bool"]["must_not"] = [item]
if sort:
search_query["sort"] = sort
if "index" in query_params:
index = query_params["index"]
if index == "main":
index = settings.OPENSEARCH_INDEX_MAIN
else:
if not request.user.has_perm(f"core.index_{index}"):
message = "Not permitted to search by this index"
message_class = "danger"
return {
"message": message,
"class": message_class,
}
if index == "meta":
index = settings.OPENSEARCH_INDEX_META
elif index == "internal":
index = settings.OPENSEARCH_INDEX_INT
else:
message = "Index is not valid."
message_class = "danger"
return {
"message": message,
"class": message_class,
}
else:
index = settings.OPENSEARCH_INDEX_MAIN
results = run_main_query(
client,
request.user, # passed through run_main_query to filter_blacklisted
search_query,
custom_query=True,
index=index,
size=size,
)
if not results:
return False
if isinstance(results, Exception):
message = f"Error: {results.info['error']['root_cause'][0]['type']}"
message_class = "danger"
return {"message": message, "class": message_class}
if len(results["hits"]["hits"]) == 0:
message = "No results."
message_class = "danger"
return {"message": message, "class": message_class}
results_parsed = parse_results(results)
if annotate:
annotate_results(results_parsed)
if "dedup" in query_params:
if query_params["dedup"] == "on":
dedup = True
else:
dedup = False
else:
dedup = False
if reverse:
results_parsed = results_parsed[::-1]
if dedup:
if not dedup_fields:
dedup_fields = ["msg", "nick", "ident", "host", "net", "channel"]
results_parsed = dedup_list(results_parsed, dedup_fields)
# if source not in settings.SAFE_SOURCES:
# if settings.ENCRYPTION:
# encrypt_list(request.user, results_parsed, settings.ENCRYPTION_KEY)
# if settings.HASHING:
# hash_list(request.user, results_parsed)
# if settings.OBFUSCATION:
# obfuscate_list(request.user, results_parsed)
# if settings.RANDOMISATION:
# randomise_list(request.user, results_parsed)
# process_list(results)
# IMPORTANT! - DO NOT PASS query_params to the user!
context = {
"object_list": results_parsed,
"card": results["hits"]["total"]["value"],
"took": results["took"],
}
if "redacted" in results:
context["redacted"] = results["redacted"]
if "exemption" in results:
context["exemption"] = results["exemption"]
if query:
context["query"] = query
# if settings.DELAY_RESULTS:
# if source not in settings.SAFE_SOURCES:
# if not request.user.has_perm("core.bypass_delay"):
# context["delay"] = settings.DELAY_DURATION
# if settings.RANDOMISATION:
# if source not in settings.SAFE_SOURCES:
# if not request.user.has_perm("core.bypass_randomisation"):
# context["randomised"] = True
return context
def query_single_result(request, query_params):
context = query_results(request, query_params, size=100)
if not context:
return {"message": "Failed to run query", "message_class": "danger"}
if "message" in context:
return context
dedup_set = {item["nick"] for item in context["object_list"]}
if dedup_set:
context["item"] = context["object_list"][0]
return context

View File

@ -257,7 +257,7 @@
{% endif %} {% endif %}
{% if perms.core.use_insights %} {% if perms.core.use_insights %}
<a class="navbar-item" href="{% url 'insights' %}"> <a class="navbar-item" href="{# url 'insights' #}">
Insights Insights
</a> </a>
{% endif %} {% endif %}

0
core/util/__init__.py Normal file
View File

69
core/util/logs.py Normal file
View File

@ -0,0 +1,69 @@
# Other library imports
import logging
log = logging.getLogger("util")
debug = True
# Color definitions
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
COLORS = {
"WARNING": YELLOW,
"INFO": WHITE,
"DEBUG": BLUE,
"CRITICAL": YELLOW,
"ERROR": RED,
}
RESET_SEQ = "\033[0m"
COLOR_SEQ = "\033[1;%dm"
BOLD_SEQ = "\033[1m"
def formatter_message(message, use_color=True):
if use_color:
message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
else:
message = message.replace("$RESET", "").replace("$BOLD", "")
return message
class ColoredFormatter(logging.Formatter):
def __init__(self, msg, use_color=True):
logging.Formatter.__init__(self, msg)
self.use_color = use_color
def format(self, record):
levelname = record.levelname
if self.use_color and levelname in COLORS:
levelname_color = (
COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ
)
record.levelname = levelname_color
return logging.Formatter.format(self, record)
def get_logger(name):
# Define the logging format
FORMAT = "%(asctime)s %(levelname)18s $BOLD%(name)13s$RESET - %(message)s"
COLOR_FORMAT = formatter_message(FORMAT, True)
color_formatter = ColoredFormatter(COLOR_FORMAT)
# formatter = logging.Formatter(
# Why is this so complicated?
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
# ch.setFormatter(formatter)
ch.setFormatter(color_formatter)
# Define the logger on the base class
log = logging.getLogger(name)
log.setLevel(logging.INFO)
if debug:
log.setLevel(logging.DEBUG)
ch.setLevel(logging.DEBUG)
# Add the handler and stop it being silly and printing everything twice
log.addHandler(ch)
log.propagate = False
return log

View File

@ -11,10 +11,8 @@ from django_tables2 import SingleTableView
from rest_framework.parsers import FormParser from rest_framework.parsers import FormParser
from rest_framework.views import APIView from rest_framework.views import APIView
from core.db.storage import db
from core.lib.context import construct_query from core.lib.context import construct_query
# from core.lib.opensearch import query_results
from core.lib.manticore import query_results
from core.lib.threshold import ( from core.lib.threshold import (
annotate_num_chans, annotate_num_chans,
annotate_num_users, annotate_num_users,
@ -87,7 +85,7 @@ def make_graph(results):
date = str(index) date = str(index)
graph.append( graph.append(
{ {
"text": item.get("tokens", None) "text": item.get("words_noun", None)
or item.get("msg", None) or item.get("msg", None)
or item.get("id"), or item.get("id"),
"nick": item.get("nick", None), "nick": item.get("nick", None),
@ -108,9 +106,9 @@ def drilldown_search(request, return_context=False, template=None):
else: else:
template_name = template template_name = template
if request.user.is_anonymous: if request.user.is_anonymous:
sizes = settings.MANTICORE_MAIN_SIZES_ANON sizes = settings.MAIN_SIZES_ANON
else: else:
sizes = settings.MANTICORE_MAIN_SIZES sizes = settings.MAIN_SIZES
if request.GET: if request.GET:
if not request.htmx: if not request.htmx:
@ -165,7 +163,7 @@ def drilldown_search(request, return_context=False, template=None):
tags = parse_tags(query_params["tags"]) tags = parse_tags(query_params["tags"])
extra_params["tags"] = tags extra_params["tags"] = tags
context = query_results(request, query_params, **extra_params) context = db.query_results(request, query_params, **extra_params)
context["unique"] = "results" context["unique"] = "results"
# Valid sizes # Valid sizes
@ -375,7 +373,7 @@ class DrilldownContextModal(APIView):
type=type, type=type,
nicks=nicks_sensitive, nicks=nicks_sensitive,
) )
results = query_results( results = db.query_results(
request, request,
query_params, query_params,
size=size, size=size,

View File

@ -7,9 +7,9 @@ from django.views import View
from rest_framework.parsers import FormParser from rest_framework.parsers import FormParser
from rest_framework.views import APIView from rest_framework.views import APIView
from core.db.druid import query_single_result
from core.lib.meta import get_meta from core.lib.meta import get_meta
from core.lib.nicktrace import get_nicks from core.lib.nicktrace import get_nicks
from core.lib.opensearch import query_single_result
from core.lib.threshold import ( from core.lib.threshold import (
annotate_num_chans, annotate_num_chans,
annotate_num_users, annotate_num_users,

View File

@ -65,7 +65,13 @@ class DrilldownTable(Table):
realname = Column() realname = Column()
server = Column() server = Column()
mtype = Column() mtype = Column()
tokens = Column() # tokens = Column()
lang_code = Column()
lang_name = Column()
words_noun = Column()
words_adj = Column()
words_verb = Column()
words_adv = Column()
hidden = Column() hidden = Column()
filename = Column() filename = Column()
file_md5 = Column() file_md5 = Column()

View File

@ -14,9 +14,6 @@ cryptography
siphashc siphashc
redis redis
sortedcontainers sortedcontainers
#manticoresearch
django-debug-toolbar django-debug-toolbar
django-debug-toolbar-template-profiler django-debug-toolbar-template-profiler
ujson
orjson orjson
pydruid

View File

@ -13,9 +13,6 @@ cryptography
siphashc siphashc
redis redis
sortedcontainers sortedcontainers
#manticoresearch
django-debug-toolbar django-debug-toolbar
django-debug-toolbar-template-profiler django-debug-toolbar-template-profiler
ujson
orjson orjson
pydruid

View File

@ -14,9 +14,6 @@ cryptography
siphashc siphashc
redis redis
sortedcontainers sortedcontainers
#manticoresearch
django-debug-toolbar django-debug-toolbar
django-debug-toolbar-template-profiler django-debug-toolbar-template-profiler
ujson
orjson orjson
pydruid