Implement redaction for search results

This commit is contained in:
Mark Veidemanis 2022-07-21 13:49:27 +01:00
parent dfabddb6b1
commit 2362048cc7
Signed by: m
GPG Key ID: 5ACFCEED46C0904F
4 changed files with 36 additions and 3 deletions

View File

@ -10,6 +10,11 @@ OPENSEARCH_INDEX_META = "meta"
OPENSEARCH_MAIN_SEARCH_FIELDS = ["msg"] OPENSEARCH_MAIN_SEARCH_FIELDS = ["msg"]
OPENSEARCH_BLACKLISTED = {
"msg": ["example.com"],
"nick": ["me"],
}
# URLs # URLs
DOMAIN = "example.com" DOMAIN = "example.com"
URL = f"https://{DOMAIN}" URL = f"https://{DOMAIN}"

View File

@ -1,6 +1,10 @@
import pprint
from django.conf import settings from django.conf import settings
from opensearchpy import OpenSearch from opensearchpy import OpenSearch
pp = pprint.PrettyPrinter(indent=4)
def initialise_opensearch(): def initialise_opensearch():
auth = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD) auth = (settings.OPENSEARCH_USERNAME, settings.OPENSEARCH_PASSWORD)
@ -57,9 +61,33 @@ def construct_query(query, fields, results):
return query return query
def _is_blacklisted(hit, blacklist):
    """Return True if any blacklisted term occurs in a matching field of *hit*.

    *blacklist* maps a ``_source`` field name to a list of terms. A term
    matches when it is a case-sensitive substring of ``str(field value)``.
    """
    # A hit without a ``_source`` has no fields to match, so it is kept.
    source = hit.get("_source") or {}
    return any(
        term in str(source[field])
        for field, terms in blacklist.items()
        if field in source
        for term in terms
    )


def filter_blacklisted(response, blacklist=None):
    """Remove blacklisted hits from a search response, in place.

    Every hit in ``response["hits"]["hits"]`` whose ``_source`` contains a
    blacklisted term in the corresponding field is dropped, and the number
    of dropped hits is stored in ``response["redacted"]`` so the UI can
    report it.

    Args:
        response: Search response dict; must contain ``["hits"]["hits"]``.
        blacklist: Optional mapping of field name to a list of forbidden
            substrings. Defaults to ``settings.OPENSEARCH_BLACKLISTED``.

    Returns:
        None. ``response`` is modified in place.
    """
    if blacklist is None:
        blacklist = settings.OPENSEARCH_BLACKLISTED

    hits = response["hits"]["hits"]
    # Single pass: each hit is judged on its own, so duplicate-equal hits
    # cannot cause a different hit to be removed or counted twice.
    kept = [hit for hit in hits if not _is_blacklisted(hit, blacklist)]

    # Let the UI know how many hits were redacted.
    response["redacted"] = len(hits) - len(kept)
    # Slice-assign so anyone holding a reference to the list sees the change.
    hits[:] = kept
def run_main_query(client, query, fields=None, results=None): def run_main_query(client, query, fields=None, results=None):
search_query = construct_query(query, fields, results) search_query = construct_query(query, fields, results)
# fmt: off # fmt: off
response = client.search(body=search_query, response = client.search(body=search_query,
index=settings.OPENSEARCH_INDEX_MAIN) index=settings.OPENSEARCH_INDEX_MAIN)
filter_blacklisted(response)
return response return response

View File

@ -126,6 +126,7 @@
</div> </div>
</div> </div>
<p>{{ card }} hits</p> <p>{{ card }} hits</p>
<p>{{ redacted }} redacted</p>
<p>{{ took }}ms</p> <p>{{ took }}ms</p>
{% endif %} {% endif %}

View File

@ -27,16 +27,14 @@ class Drilldown(LoginRequiredMixin, View):
fields = None fields = None
if "fields" in request.POST: if "fields" in request.POST:
fields = request.POST.getlist("fields") fields = request.POST.getlist("fields")
print("FIELD", fields)
if "results" in request.POST: if "results" in request.POST:
results = request.POST["results"] results = request.POST["results"]
print("RESULTS", results)
if "query" in request.POST: if "query" in request.POST:
query = request.POST["query"] query = request.POST["query"]
# field = results.POST["field"] # field = results.POST["field"]
# print("FIELD ", field) # print("FIELD ", field)
results = run_main_query(client, query, fields, results) results = run_main_query(client, query, fields, results)
pp.pprint(results) # pp.pprint(results)
results_parsed = [] results_parsed = []
if "hits" in results.keys(): if "hits" in results.keys():
if "hits" in results["hits"]: if "hits" in results["hits"]:
@ -47,6 +45,7 @@ class Drilldown(LoginRequiredMixin, View):
"results": results_parsed, "results": results_parsed,
"card": results["hits"]["total"]["value"], "card": results["hits"]["total"]["value"],
"took": results["took"], "took": results["took"],
"redacted": results["redacted"],
"fields": settings.OPENSEARCH_MAIN_SEARCH_FIELDS, "fields": settings.OPENSEARCH_MAIN_SEARCH_FIELDS,
} }
return render(request, self.template_name, context) return render(request, self.template_name, context)