Improve data security by mandating token search

master
Mark Veidemanis 2 years ago
parent e85fa910aa
commit 3f02c61463
Signed by: m
GPG Key ID: 5ACFCEED46C0904F

@ -16,6 +16,45 @@ OPENSEARCH_MAIN_WILDCARD_ANON = False
OPENSEARCH_MAIN_SOURCES = ["irc", "dis", "all"]
DRILLDOWN_RESULTS_PER_PAGE = 15
# Encryption
# Switch and key for the encryption feature.
# NOTE(review): the key is empty here - presumably supplied per deployment; confirm.
ENCRYPTION = False
ENCRYPTION_KEY = b""
# Hashing
# When enabled, results are hashed for users without the "view_plain"
# permission, and hashes found in incoming search parameters are looked up
# before the query is built.
# NOTE(review): "xxx" looks like a placeholder key - confirm it is overridden.
HASHING = True
HASHING_KEY = "xxx"
# Common to encryption and hashing
# Field names exempt from obfuscation: hash_list() leaves these keys as they
# are and hash_lookup() does not scan their values for hashes.
# Each name is listed exactly once.
WHITELIST_FIELDS = [
    "ts",
    "date",
    "time",
    "sentiment",
    "version_sentiment",
    "tokens",
    "num_chans",
    "num_users",
    "src",
    "exemption",
    "hidden",
]
# Don't obfuscate these parameters, or lookup hashes in them.
# hash_list() skips these keys when hashing, and hash_lookup() skips them
# both when collecting hashes and when rejecting values that contain no hash,
# so inputs such as "query" pass through unchanged.
NO_OBFUSCATE_PARAMS = [
"query",
"query_full",
"size",
"source",
"sorting",
"tags",
"index",
"dedup",
"check_sentiment",
"sentiment_method",
"dates",
]
OPENSEARCH_BLACKLISTED = {}
# URLs

@ -5,7 +5,16 @@ from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError
from core.lib.threshold import annotate_num_chans, annotate_num_users, annotate_online
from core.views.helpers import dedup_list, encrypt_list, hash_list, hash_lookup
from core.views.helpers import (
SearchDenied,
dedup_list,
encrypt_list,
hash_list,
hash_lookup,
)
# from json import dumps
# pp = lambda x: print(dumps(x, indent=2))
def initialise_opensearch():
@ -141,47 +150,66 @@ def filter_blacklisted(user, response):
response["hits"]["hits"] = [hit for hit in response["hits"]["hits"] if hit]
def construct_query(query, size, use_query_string=True, tokens=False):
    """
    Accept some query parameters and construct an OpenSearch query.

    query is the search text supplied by the caller. size is the maximum
    number of hits to request; a falsy size falls back to 5.

    When tokens is true, a simple_query_string clause restricted to the
    "tokens" field is added. Otherwise, when use_query_string is true, a
    query_string clause is added. tokens takes precedence over
    use_query_string. When both are false the returned query has an empty
    "must" list for the caller to fill in.

    Returns the request body as a dictionary with "size" and a
    bool/must "query".
    """
    if not size:
        size = 5

    must = []
    if tokens:
        # Token search: only the "tokens" field is searched, and every
        # term must match.
        must.append(
            {
                "simple_query_string": {
                    "query": query,
                    "fields": ["tokens"],
                    "flags": "ALL",
                    "fuzzy_transpositions": True,
                    "fuzzy_max_expansions": 50,
                    "fuzzy_prefix_length": 0,
                    "default_operator": "and",
                    "analyzer": "standard",
                    "lenient": True,
                    "boost": 1,
                    "quote_field_suffix": "",
                    "analyze_wildcard": False,
                    "auto_generate_synonyms_phrase_query": False,
                }
            }
        )
    elif use_query_string:
        # Full query string syntax; no field restriction is set here.
        must.append(
            {
                "query_string": {
                    "query": query,
                    "fuzziness": "AUTO",
                    "fuzzy_transpositions": True,
                    "fuzzy_max_expansions": 50,
                    "fuzzy_prefix_length": 0,
                    "default_operator": "or",
                    "analyzer": "standard",
                    "lenient": True,
                    "boost": 1,
                    "allow_leading_wildcard": True,
                    "phrase_slop": 3,
                    "quote_field_suffix": "",
                    "quote_analyzer": "standard",
                    "analyze_wildcard": False,
                    "auto_generate_synonyms_phrase_query": True,
                }
            }
        )

    return {
        "size": size,
        "query": {"bool": {"must": must}},
    }
def run_main_query(client, user, query, custom_query=False, index=None, size=None):
@ -261,6 +289,7 @@ def query_results(
dedup=False,
dedup_fields=None,
lookup_hashes=True,
tags=None,
):
"""
API helper to alter the OpenSearch return format into something
@ -276,12 +305,15 @@ def query_results(
add_top = []
add_top_negative = []
sort = None
query_created = False
# Lookup the hash values but don't disclose them to the user
if lookup_hashes:
if settings.HASHING:
query_params = deepcopy(query_params)
hash_lookup(query_params)
hash_lookup(request.user, query_params)
if tags:
hash_lookup(request.user, tags)
if request.user.is_anonymous:
sizes = settings.OPENSEARCH_MAIN_SIZES_ANON
@ -366,15 +398,53 @@ def query_results(
range_query_precise["match"]["sentiment"] = 0
add_top_negative.append(range_query_precise)
# Only one of query or query_full can be active at once
# We prefer query because it's simpler
if "query" in query_params:
query = query_params["query"]
search_query = construct_query(query, size)
search_query = construct_query(query, size, tokens=True)
query_created = True
elif "query_full" in query_params:
query_full = query_params["query_full"]
if request.user.has_perm("query_search"):
search_query = construct_query(query_full, size)
query_created = True
else:
message = "You cannot search by query string"
message_class = "danger"
return {"message": message, "class": message_class}
else:
if custom_query:
search_query = custom_query
if tags:
# Get a blank search query
if not query_created:
search_query = construct_query(None, size, use_query_string=False)
query_created = True
for tagname, tagvalue in tags.items():
add_bool.append({tagname: tagvalue})
required_any = ["query_full", "query", "tags"]
if not any([field in query_params.keys() for field in required_any]):
if not custom_query:
message = "Empty query!"
message_class = "warning"
return {"message": message, "class": message_class}
if add_bool:
# if "bool" not in search_query["query"]:
# search_query["query"]["bool"] = {}
# if "must" not in search_query["query"]["bool"]:
# search_query["query"]["bool"] = {"must": []}
for item in add_bool:
search_query["query"]["bool"]["must"].append({"match": item})
k, v = list(item.items())[0]
if isinstance(v, SearchDenied):
message = f"Access denied: search by protected field {k}: {v.value}"
message_class = "danger"
return {"message": message, "class": message_class}
search_query["query"]["bool"]["must"].append({"match_phrase": item})
if add_top:
for item in add_top:
search_query["query"]["bool"]["must"].append(item)
@ -398,7 +468,6 @@ def query_results(
return {
"message": message,
"class": message_class,
"params": query_params,
}
if index == "meta":
index = settings.OPENSEARCH_INDEX_META
@ -410,7 +479,6 @@ def query_results(
return {
"message": message,
"class": message_class,
"params": query_params,
}
else:
@ -461,7 +529,6 @@ def query_results(
if not request.user.has_perm("view_plain"):
if settings.HASHING:
hash_list(request.user, results_parsed)
# process_list(results)
# IMPORTANT! - DO NOT PASS query_params to the user!

File diff suppressed because one or more lines are too long

@ -15,6 +15,7 @@
<link rel="stylesheet" href="{% static 'css/bulma-slider.min.css' %}">
<link rel="stylesheet" href="{% static 'css/bulma-calendar.min.css' %}">
<link rel="stylesheet" href="{% static 'css/bulma-tagsinput.min.css' %}">
<link rel="stylesheet" href="{% static 'css/bulma-switch.min.css' %}">
<script src="{% static 'js/bulma-calendar.min.js' %}" integrity="sha384-DThNif0xGXbopX7+PE+UabkuClfI/zELNhaVqoGLutaWB76dyMw0vIQBGmUxSfVQ" crossorigin="anonymous"></script>
<script src="{% static 'js/bulma-slider.min.js' %}" integrity="sha384-wbyps8iLG8QzJE02viYc/27BtT5HSa11+b5V7QPR1/huVuA8f4LRTNGc82qAIeIZ" crossorigin="anonymous"></script>
<script defer src="{% static 'js/htmx.min.js' %}" integrity="sha384-cZuAZ+ZbwkNRnrKi05G/fjBX+azI9DNOkNYysZ0I/X5ZFgsmMiBXgDZof30F5ofc" crossorigin="anonymous"></script>

@ -39,60 +39,23 @@
} catch {
var value = spl[1];
}
populateSearch(field, value);
return `${field}: ${value}`;
});
inputTags.BulmaTagsInput().on('after.remove', function(item) {
var spl = item.split(": ");
var field = spl[0];
var value = spl[1].trim();
populateSearch(field, value);
});
}
function populateSearch(field, value) {
var queryElement = document.getElementById('query');
var present = true;
if (present == true) {
var combinations = [`${field}: "${value}"`,
`${field}: "${value}"`,
`${field}: ${value}`,
`${field}:${value}`,
`${field}:"${value}"`];
var toAppend = ` AND ${field}: "${value}"`;
} else {
var combinations = [`NOT ${field}: "${value}"`,
`NOT ${field}: "${value}"`,
`NOT ${field}: ${value}`,
`NOT ${field}:${value}`,
`NOT ${field}:"${value}"`];
}
var contains = combinations.some(elem => queryElement.value.includes(elem));
if (!contains) {
queryElement.value+=toAppend;
} else {
for (var index in combinations) {
combination = combinations[index];
queryElement.value = queryElement.value.replaceAll("AND "+combination, "");
queryElement.value = queryElement.value.replaceAll(combination, "");
}
}
if (field == "src") {
document.getElementById("source").selectedIndex = 2;
}
if (queryElement.value.startsWith(" AND ")) {
queryElement.value = queryElement.value.replace(" AND ", "");
}
if (queryElement.value.startsWith("AND ")) {
queryElement.value = queryElement.value.replace("AND ", "");
}
var inputTags = document.getElementById('tags');
inputTags.BulmaTagsInput().add(field+": "+value);
htmx.trigger("#search", "click");
}
</script>
<div>
{% include 'partials/notify.html' %}
<form method="POST" hx-post="{% url 'search' %}"
<form class="skipEmptyFields" method="POST" hx-post="{% url 'search' %}"
hx-trigger="change"
hx-target="#results"
hx-swap="innerHTML"
@ -102,12 +65,17 @@
<div class="columns">
<div class="column">
<div class="field has-addons">
<div class="control is-expanded has-icons-left">
<div id="query" class="control is-expanded has-icons-left">
<input
hx-post="{% url 'search' %}"
hx-trigger="keyup changed delay:200ms"
hx-target="#results"
hx-swap="innerHTML" id="query" name="query" value="{{ params.query }}" class="input" type="text" placeholder="msg: science AND nick: BillNye AND channel: #science">
hx-swap="innerHTML"
name="query"
value="{{ params.query }}"
class="input"
type="text"
placeholder="(science | tech | art) + (interest) -hello">
<span class="icon is-small is-left">
<i class="fas fa-magnifying-glass"></i>
</span>
@ -166,6 +134,19 @@
</a>
</p>
</div>
<div class="control">
<div class="field">
<input
id="full_query"
type="checkbox"
class="switch"
{% if params.query_full is not None %}checked="checked"{% else %}none{% endif %}
data-script="on click toggle .is-hidden on #query_full">
<label
class="{% if not perms.core.query_search %}is-disabled{% endif %}"
for="full_query">Full query </label>
</div>
</div>
</div>
<div class="column is-narrow">
<div class="field has-addons block">
@ -411,24 +392,45 @@
</div>
</div>
</div>
<div id="query_full" class="block {% if params.query_full is None %}is-hidden{% endif %}">
<div class="control is-expanded has-icons-left">
<input
hx-post="{% url 'search' %}"
hx-trigger="keyup changed delay:200ms"
hx-target="#results"
hx-swap="innerHTML"
name="query_full"
value="{{ params.query_full }}"
class="input"
type="text"
placeholder="msg: science AND nick: BillNye AND channel: #science">
<span class="icon is-small is-left">
<i class="fas fa-magnifying-glass"></i>
</span>
</div>
</div>
<div class="block">
<input
hx-trigger="change"
hx-post="{% url 'search' %}"
hx-target="#results"
hx-swap="innerHTML"
id="tags"
class="input"
type="tags"
name="tags"
placeholder="Add tags"
value="{{ params.tags }}">
</div>
<div class="is-hidden"></div>
</form>
</div>
<div class="block">
<input id="tags" class="input" type="tags" placeholder="Add query" value="{{ tags|joinsep:',' }}">
</div>
<div class="block">
<div id="results">
<!-- {% if results %}
{% include 'ui/drilldown/results.html' %}
{% endif %} -->
{% if table %}
{% include 'ui/drilldown/table_results.html' %}
{% else %}
<script>
setupTags();
</script>
{% endif %}
{% include 'ui/drilldown/table_results.html' %}
<script>
setupTags();
</script>
</div>
</div>
<div id="modals-here">

@ -37,23 +37,3 @@
{% endif %}
{% include 'ui/drilldown/table_results_partial.html' %}
{% endif %}
{# Update the tags in case the user changed the query #}
{# Check for focus and refocus #}
<script>
var inputTags = document.getElementsByClassName('tags-input');
var inputBox = document.querySelector("[placeholder='Add query']");
var isFocused = (document.activeElement === inputBox);
for (index = 0; index < inputTags.length; index++) {
if (index == 0) {
inputTags[0].outerHTML = '<input id="tags" class="input" type="tags" placeholder="Add query" value="{{ tags|joinsep:',' }}">';
} else {
inputTags[index].remove();
}
}
// inputTags[0].outerHTML = '<input id="tags" class="input" type="tags" placeholder="Add query" value="{{ tags|joinsep:',' }}">';
setupTags();
var inputBox = document.querySelector("[placeholder='Add query']");
if (isFocused) {
inputBox.focus();
}
</script>

@ -143,7 +143,7 @@
</span>
</td>
{% elif column.name == 'tokens' %}
<td class="{{ column.name }}">
<td class="{{ column.name }} wrap" style="max-width: 10em">
{{ cell|joinsep:',' }}
</td>
{% elif column.name == 'src' %}

@ -10,6 +10,11 @@ from sortedcontainers import SortedSet
from core import r
class SearchDenied:
    """
    Marker for a search value that was refused.

    hash_lookup() stores one of these in place of a parameter value that
    contains no hash when the user lacks the "bypass_hashing" permission.
    The original value is kept on the instance so it can be reported back.
    """

    def __init__(self, value):
        # The refused value, exactly as it was submitted.
        self.value = value
def dedup_list(data, check_keys):
"""
Remove duplicate dictionaries from list.
@ -90,7 +95,10 @@ def hash_list(user, data, hash_keys=False):
for index, item in enumerate(data_copy):
if isinstance(item, dict):
for key, value in list(item.items()):
if key not in settings.WHITELIST_FIELDS:
if (
key not in settings.WHITELIST_FIELDS
and key not in settings.NO_OBFUSCATE_PARAMS
):
if isinstance(value, int):
value = str(value)
if isinstance(value, bool):
@ -122,18 +130,35 @@ def hash_list(user, data, hash_keys=False):
r.hmset(cache, hash_table)
def hash_lookup(data_dict):
def hash_lookup(user, data_dict):
cache = "cache.hash"
hash_list = SortedSet()
for key, value in data_dict.items():
if not value:
continue
# hashes = re.findall("\|([^\|]*)\|", value) # noqa
hashes = re.findall("[A-Z0-9]{12,13}", value)
if not hashes:
continue
for hash in hashes:
hash_list.add(hash)
for key, value in list(data_dict.items()):
if (
key not in settings.WHITELIST_FIELDS
and key not in settings.NO_OBFUSCATE_PARAMS
):
if not value:
continue
# hashes = re.findall("\|([^\|]*)\|", value) # noqa
if isinstance(value, str):
hashes = re.findall("[A-Z0-9]{12,13}", value)
elif isinstance(value, dict):
hashes = []
for key, value in value.items():
if not value:
continue
hashes_iter = re.findall("[A-Z0-9]{12,13}", value)
for h in hashes_iter:
hashes.append(h)
if not hashes:
# Otherwise the user could inject plaintext search queries
if not user.has_perm("bypass_hashing"):
data_dict[key] = SearchDenied(value=data_dict[key])
# del data_dict[key]
for hash in hashes:
hash_list.add(hash)
if hash_list:
values = r.hmget(cache, *hash_list)
@ -147,8 +172,17 @@ def hash_lookup(data_dict):
for key in data_dict.keys():
for hash in total:
if data_dict[key]:
if hash in data_dict[key]:
data_dict[key] = data_dict[key].replace(f"{hash}", total[hash])
if isinstance(data_dict[key], str):
if hash in data_dict[key]:
print("Replacing", data_dict[key], "with", total[hash])
data_dict[key] = data_dict[key].replace(
f"{hash}", total[hash]
)
elif isinstance(data_dict[key], dict):
for k2, v2 in data_dict[key].items():
if hash in v2:
print("Replacing", v2, "with", total[hash])
data_dict[key][k2] = v2.replace(f"{hash}", total[hash])
def encrypt_list(user, data, secret):

@ -55,6 +55,21 @@ def create_tags(query):
return tags
def parse_tags(tags_pre):
    """
    Parse the tags from the variable tags_pre.

    tags_pre is a comma-separated string of "key: value" pairs, for example
    "nick: foo,channel: #bar". Returns a dictionary mapping each key to its
    value; a later duplicate key overwrites an earlier one.

    Only the first ": " in a tag separates the key from the value, so a value
    may itself contain ": ". Tags without a ": " separator or with an empty
    key are ignored. An empty or missing tags_pre yields an empty dictionary.
    """
    tags = {}
    if not tags_pre:
        return tags
    for tag in tags_pre.split(","):
        # Tolerate whitespace after the comma so "a: 1, b: 2" gives key "b"
        # rather than " b".
        key, separator, val = tag.lstrip().partition(": ")
        key = key.strip()
        if not separator or not key:
            continue
        tags[key] = val
    return tags
def make_table(context):
table = DrilldownTable(context["object_list"])
context["table"] = table
@ -78,6 +93,8 @@ def make_graph(results):
def drilldown_search(request, return_context=False, template=None):
extra_params = {}
if not template:
template_name = "ui/drilldown/table_results.html"
else:
@ -105,6 +122,10 @@ def drilldown_search(request, return_context=False, template=None):
query_params.update(tmp_post)
query_params.update(tmp_get)
# URI we're passing to the template for linking
if "csrfmiddlewaretoken" in query_params:
del query_params["csrfmiddlewaretoken"]
# Parse the dates
if "dates" in query_params:
dates = parse_dates(query_params["dates"])
@ -118,24 +139,34 @@ def drilldown_search(request, return_context=False, template=None):
query_params["to_time"] = dates["to_time"]
if "query" in query_params:
context = query_results(request, query_params)
# Remove null values
if query_params["query"] == "":
del query_params["query"]
# Turn the query into tags for populating the taglist
# tags = create_tags(query_params["query"])
# context["tags"] = tags
# else:
# context = {"object_list": []}
# Remove null values
if "query_full" in query_params:
if query_params["query_full"] == "":
del query_params["query_full"]
if "tags" in query_params:
if query_params["tags"] == "":
del query_params["tags"]
else:
tags = parse_tags(query_params["tags"])
extra_params["tags"] = tags
# Turn the query into tags for populating the taglist
tags = create_tags(query_params["query"])
context["tags"] = tags
else:
context = {"object_list": []}
context = query_results(request, query_params, **extra_params)
# Valid sizes
context["sizes"] = sizes
# URI we're passing to the template for linking
if "csrfmiddlewaretoken" in query_params:
del query_params["csrfmiddlewaretoken"]
url_params = urllib.parse.urlencode(query_params)
context["client_uri"] = url_params
context["params"] = query_params
if "message" in context:
response = render(request, template_name, context)
@ -158,6 +189,17 @@ def drilldown_search(request, return_context=False, template=None):
clean_url_params = urllib.parse.urlencode(clean_params)
context["uri"] = clean_url_params
# Warn users trying to use query string that the simple query supersedes it
if all([x in query_params for x in ["query", "query_full"]]):
context["message"] = (
"You are searching with both query types. "
"The simple query will be used. "
"The full query will be ignored. "
"Remove the text from the simple query if you wish "
"to use the full query."
)
context["class"] = "warning"
response = render(request, template_name, context)
if request.GET:
if request.htmx:
@ -260,7 +302,7 @@ class DrilldownContextModal(APIView):
# Lookup the hash values but don't disclose them to the user
if settings.HASHING:
SAFE_PARAMS = deepcopy(query_params)
hash_lookup(SAFE_PARAMS)
hash_lookup(request.user, SAFE_PARAMS)
else:
SAFE_PARAMS = query_params
@ -383,7 +425,7 @@ class ThresholdInfoModal(APIView):
# Lookup the hash values but don't disclose them to the user
if settings.HASHING:
SAFE_PARAMS = request.data.dict()
hash_lookup(SAFE_PARAMS)
hash_lookup(request.user, SAFE_PARAMS)
safe_net = SAFE_PARAMS["net"]
safe_nick = SAFE_PARAMS["nick"]
safe_channel = SAFE_PARAMS["channel"]

Loading…
Cancel
Save