Implement attachment view
This commit is contained in:
162
core/management/commands/backfill_xmpp_attachment_urls.py
Normal file
162
core/management/commands/backfill_xmpp_attachment_urls.py
Normal file
@@ -0,0 +1,162 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db.models import Q
|
||||
|
||||
from core.models import Message, MessageEvent
|
||||
|
||||
|
||||
# Message.text values (after stripping whitespace) that count as "no body" and
# are therefore eligible to be overwritten by the backfill.
EMPTY_TEXT_VALUES = {"", "[No Body]", "[no body]"}
|
||||
|
||||
|
||||
def _normalize_url(value):
|
||||
if not value:
|
||||
return ""
|
||||
text = str(value).strip().rstrip(".,);:!?\"'")
|
||||
if text.startswith("http://") or text.startswith("https://"):
|
||||
return text
|
||||
return ""
|
||||
|
||||
|
||||
def _extract_urls_from_attachment_blob(blob):
    """Collect http(s) URLs from an attachment payload of unknown shape.

    Handles three shapes, recursing where needed:

    * ``str``  - treated as a single URL candidate.
    * ``dict`` - the keys ``url``, ``source_url``, ``download_url``,
      ``proxy_url`` and ``href`` are checked in that order, followed by every
      entry of an ``attachments`` list if one is present.
    * ``list`` - every element is processed in order.

    Any other type yields an empty list. Duplicates are not removed.

    Returns:
        A list of normalized URL strings in discovery order.
    """
    if isinstance(blob, str):
        single = _normalize_url(blob)
        return [single] if single else []

    if isinstance(blob, dict):
        candidates = (
            _normalize_url(blob.get(field))
            for field in ("url", "source_url", "download_url", "proxy_url", "href")
        )
        found = [candidate for candidate in candidates if candidate]
        children = blob.get("attachments")
        if isinstance(children, list):
            for child in children:
                found += _extract_urls_from_attachment_blob(child)
        return found

    if isinstance(blob, list):
        return [
            url
            for item in blob
            for url in _extract_urls_from_attachment_blob(item)
        ]

    return []
|
||||
|
||||
|
||||
def _uniq(values):
|
||||
seen = set()
|
||||
output = []
|
||||
for value in values:
|
||||
if value in seen:
|
||||
continue
|
||||
seen.add(value)
|
||||
output.append(value)
|
||||
return output
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that fills empty XMPP message bodies with URLs.

    For each ``Message`` whose ``sender_uuid`` is "xmpp" (case-insensitive)
    and whose text is null, empty or the "[No Body]" placeholder, the command
    looks up related ``MessageEvent`` rows, extracts attachment URLs from
    them and writes the URLs (one per line) into ``Message.text``.

    Runs as a dry-run unless ``--apply`` is given.
    """

    help = (
        "Backfill empty Message.text rows originating from XMPP by recovering "
        "attachment URLs from MessageEvent metadata."
    )

    def add_arguments(self, parser):
        """Register the ``--apply``, ``--user-id`` and ``--limit`` options."""
        parser.add_argument(
            "--apply",
            action="store_true",
            help="Persist updates. Without this flag, runs as dry-run.",
        )
        parser.add_argument(
            "--user-id",
            type=int,
            default=None,
            help="Limit processing to one user ID.",
        )
        parser.add_argument(
            "--limit",
            type=int,
            default=0,
            help="Maximum number of candidate rows to inspect (0 = no limit).",
        )

    def _candidate_events(self, message):
        """Return the MessageEvent rows that may hold *message*'s attachments.

        Events explicitly linked through
        ``raw_payload_ref.legacy_message_id`` are preferred. If none exist,
        fall back to the same user's XMPP events with non-empty attachments
        whose ``ts`` lies within a fixed window around ``message.ts``.

        Returns:
            A ``MessageEvent`` queryset (possibly empty).
        """
        linked = MessageEvent.objects.filter(
            user=message.user,
            raw_payload_ref__legacy_message_id=str(message.id),
        )
        if linked.exists():
            return linked

        # Without a timestamp the time-window heuristic cannot run; return the
        # (empty) linked queryset instead of crashing on int(None).
        if message.ts is None:
            return linked

        # Fallback heuristic for older rows with missing legacy refs.
        # The window is expressed in the same units as ``ts``.
        # NOTE(review): no explicit ordering is applied here, so which event
        # the caller sees first depends on the model's default ordering.
        window = 2000
        anchor = int(message.ts)
        return MessageEvent.objects.filter(
            user=message.user,
            source_system="xmpp",
            ts__gte=anchor - window,
            ts__lte=anchor + window,
        ).exclude(attachments=[])

    def handle(self, *args, **options):
        """Run the backfill and print a summary line.

        Options read: ``apply`` (persist changes), ``user_id`` (restrict to
        one user) and ``limit`` (cap on rows inspected; values <= 0 mean no
        cap). In dry-run mode one line is printed per recoverable message.
        """
        apply_changes = bool(options.get("apply"))
        user_id = options.get("user_id")
        limit = int(options.get("limit") or 0)

        queryset = Message.objects.filter(
            sender_uuid__iexact="xmpp",
        ).filter(
            Q(text__isnull=True) | Q(text__exact="") | Q(text__iexact="[No Body]")
        )
        # Compare against None so that an explicit ``--user-id 0`` still
        # restricts the run instead of silently processing every user.
        if user_id is not None:
            queryset = queryset.filter(user_id=user_id)
        queryset = queryset.order_by("ts", "id")
        if limit > 0:
            queryset = queryset[:limit]

        # The database filter matches the placeholder case-insensitively, so
        # the in-Python re-check must do the same; otherwise variants such as
        # "[NO BODY]" are selected and then silently skipped.
        empty_values = {value.lower() for value in EMPTY_TEXT_VALUES}

        inspected = 0
        recoverable = 0
        updated = 0
        unrecoverable = 0

        for message in queryset.iterator():
            inspected += 1
            current_text = str(message.text or "").strip()
            if current_text.lower() not in empty_values:
                continue

            urls = []
            for event in self._candidate_events(message):
                urls.extend(_extract_urls_from_attachment_blob(event.attachments))
                urls.extend(
                    _extract_urls_from_attachment_blob(event.raw_payload_ref or {})
                )
                # Stop at the first event that yields anything so URLs from
                # several events are not merged into one message.
                if urls:
                    break

            urls = _uniq(urls)
            if not urls:
                unrecoverable += 1
                continue

            recoverable += 1
            new_text = "\n".join(urls)
            if apply_changes:
                message.text = new_text
                # Only write the text column.
                message.save(update_fields=["text"])
                updated += 1
            else:
                self.stdout.write(
                    f"[dry-run] {message.id}: would set {len(urls)} URL(s)"
                )

        mode = "apply" if apply_changes else "dry-run"
        self.stdout.write(
            self.style.SUCCESS(
                "XMPP attachment URL backfill complete "
                f"({mode}): inspected={inspected}, "
                f"recoverable={recoverable}, "
                f"updated={updated}, "
                f"unrecoverable={unrecoverable}"
            )
        )
|
||||
Reference in New Issue
Block a user