Files
GIA/core/memory/pipeline.py

420 lines
14 KiB
Python

from __future__ import annotations
import re
from datetime import timezone as dt_timezone
from typing import Any
from django.db import transaction
from django.utils import timezone
from django.utils.dateparse import parse_datetime
from core.models import (
MemoryChangeRequest,
MemoryItem,
MemorySourceReference,
MessageEvent,
WorkspaceConversation,
)
from core.util import logs
# Module-level logger for the memory pipeline.
log = logs.get_logger("memory-pipeline")

# First-person statements of preference ("i like", "i love", "i prefer",
# "my favorite"). The named group ``value`` captures the following 2-120
# characters up to (not including) the next '.', '!' or '?'. Case-insensitive.
_LIKE_RE = re.compile(
    r"\b(?:i (?:like|love|prefer)|my favorite)\s+(?P<value>[^.!?]{2,120})",
    re.IGNORECASE,
)

# First-person statements of aversion ("i dislike", "i hate", "i avoid",
# "i don't like"). Same ``value`` capture rules as _LIKE_RE. Only the ASCII
# apostrophe form of "don't" is matched.
_DISLIKE_RE = re.compile(
    r"\b(?:i (?:dislike|hate|avoid)|i don't like)\s+(?P<value>[^.!?]{2,120})",
    re.IGNORECASE,
)

# Requests introduced by "please" / "pls". ``value`` captures the following
# 3-120 characters up to the next '.', '!' or '?'. Case-insensitive.
_STYLE_RE = re.compile(
    r"\b(?:please|pls)\s+(?P<value>[^.!?]{3,120})",
    re.IGNORECASE,
)
def _clean_value(value: str) -> str:
return " ".join(str(value or "").strip().split())
def extract_memory_candidates(text: str) -> list[dict[str, Any]]:
    """Extract regex-based memory candidates from a piece of message text.

    Each candidate is a dict with the keys ``memory_kind``, ``field``,
    ``text`` and ``confidence_score``. Candidates are returned grouped by
    rule (likes, then dislikes, then communication style), and in match
    order within each rule. Captured values shorter than three characters
    after whitespace normalisation are dropped.

    Returns an empty list when *text* is empty or whitespace-only.
    """
    cleaned_source = str(text or "").strip()
    if not cleaned_source:
        return []

    # (pattern, content field, memory kind, confidence assigned to matches)
    rules = (
        (_LIKE_RE, "likes", "fact", 0.68),
        (_DISLIKE_RE, "dislikes", "fact", 0.68),
        (_STYLE_RE, "communication_style", "state", 0.52),
    )

    found: list[dict[str, Any]] = []
    for pattern, field_name, memory_kind, score in rules:
        captured = (
            _clean_value(hit.group("value"))
            for hit in pattern.finditer(cleaned_source)
        )
        found.extend(
            {
                "memory_kind": memory_kind,
                "field": field_name,
                "text": value,
                "confidence_score": score,
            }
            for value in captured
            if len(value) >= 3
        )
    return found
def _existing_fingerprints(user_id: int) -> set[tuple[str, str, str, str]]:
    """Build the set of de-duplication fingerprints for a user's memories.

    A fingerprint is ``(memory_kind, conversation_id, person_id,
    "field:text")`` where kind, field and text are lower-cased, text is
    whitespace-normalised, and missing ids are represented as ``""``.
    All of the user's ``MemoryItem`` rows are considered, regardless of
    status.
    """

    def _fingerprint(row) -> tuple[str, str, str, str]:
        payload = row.content or {}
        field_name = str(payload.get("field") or "").strip().lower()
        normalized_text = _clean_value(str(payload.get("text") or "")).lower()
        return (
            str(row.memory_kind or "").strip().lower(),
            str(row.conversation_id or "").strip(),
            str(row.person_id or "").strip(),
            f"{field_name}:{normalized_text}",
        )

    # Only the columns needed for the fingerprint are loaded.
    rows = MemoryItem.objects.filter(user_id=int(user_id)).only(
        "memory_kind",
        "conversation_id",
        "person_id",
        "content",
    )
    return {_fingerprint(row) for row in rows}
def _infer_single_person_id(conversation: WorkspaceConversation) -> str:
    """Return the id of the conversation's sole participant, as a string.

    Returns ``""`` when the conversation has zero participants or more than
    one, i.e. whenever the person cannot be inferred unambiguously.
    """
    # Two ids at most are fetched: enough to tell "exactly one" from "several".
    first_two = list(conversation.participants.values_list("id", flat=True)[:2])
    if len(first_two) == 1:
        return str(first_two[0] or "")
    return ""
@transaction.atomic
def suggest_memories_from_recent_messages(
    *,
    user_id: int,
    limit_messages: int = 300,
    max_items: int = 30,
) -> dict[str, int]:
    """Scan recent inbound messages and queue regex-derived memory proposals.

    For every new candidate found in the user's most recent inbound
    ``MessageEvent`` rows (newest first), this creates:

    * a ``MemoryItem`` with status ``"proposed"``,
    * a ``MemorySourceReference`` linking it to the originating message, and
    * a pending ``"create"`` ``MemoryChangeRequest`` for review.

    Candidates whose fingerprint (kind, conversation, person, field:text)
    matches an existing memory of the user, or one already queued during
    this run, are skipped.

    Args:
        user_id: Owner of the messages and resulting memories.
        limit_messages: Maximum number of messages to scan; clamped to
            1..2000 (falsy values fall back to 300).
        max_items: Maximum number of proposals to queue; clamped to 1..500
            (falsy values fall back to 30). Scanning stops as soon as this
            many have been queued.

    Returns:
        ``{"scanned": <messages examined>, "queued": <proposals created>}``.
    """
    safe_limit_messages = max(1, min(2000, int(limit_messages or 300)))
    safe_max_items = max(1, min(500, int(max_items or 30)))
    owner_id = int(user_id)
    existing = _existing_fingerprints(owner_id)
    scanned = 0
    queued = 0
    # Participant lookup costs one query per call. Memoise it per conversation
    # and only resolve it for messages that actually yield candidates, instead
    # of issuing a query for every scanned message.
    person_id_by_conversation: dict[Any, str] = {}
    rows = (
        MessageEvent.objects.filter(user_id=owner_id, direction="in")
        .select_related("conversation")
        .order_by("-ts")[:safe_limit_messages]
    )
    for event in rows:
        scanned += 1
        candidates = extract_memory_candidates(event.text or "")
        if not candidates:
            continue
        conversation_key = event.conversation_id
        if conversation_key not in person_id_by_conversation:
            person_id_by_conversation[conversation_key] = _infer_single_person_id(
                event.conversation
            )
        person_id = person_id_by_conversation[conversation_key]
        for candidate in candidates:
            field = str(candidate.get("field") or "").strip().lower()
            text = _clean_value(str(candidate.get("text") or ""))
            if not text:
                continue
            memory_kind = str(candidate.get("memory_kind") or "fact")
            fingerprint = (
                memory_kind.strip().lower(),
                str(event.conversation_id or "").strip(),
                person_id,
                f"{field}:{text.lower()}",
            )
            if fingerprint in existing:
                continue
            item = MemoryItem.objects.create(
                user_id=owner_id,
                conversation=event.conversation,
                person_id=person_id or None,
                memory_kind=memory_kind,
                status="proposed",
                content={"field": field, "text": text},
                provenance={
                    "pipeline": "message_regex",
                    "message_event_id": str(event.id),
                },
                confidence_score=float(candidate.get("confidence_score") or 0.5),
            )
            MemorySourceReference.objects.create(
                memory=item,
                message_event=event,
                source_label="message_event",
            )
            MemoryChangeRequest.objects.create(
                user_id=owner_id,
                memory=item,
                conversation=event.conversation,
                person_id=person_id or None,
                action="create",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                reason="Auto-suggested from recent inbound messages.",
                requested_by_identifier="memory-pipeline",
            )
            # Remember the fingerprint so the same statement repeated in a
            # later message is not queued twice within this run.
            existing.add(fingerprint)
            queued += 1
            if queued >= safe_max_items:
                return {"scanned": scanned, "queued": queued}
    return {"scanned": scanned, "queued": queued}
def _coerce_expires_at(value: Any):
    """Convert an optional ISO datetime string into an aware datetime.

    Returns ``None`` when *value* is empty or whitespace-only. A parsed
    datetime without timezone information is made aware using UTC; one
    that already carries a timezone is returned unchanged.

    Raises:
        ValueError: If the text cannot be parsed by ``parse_datetime``.
    """
    text = str(value or "").strip()
    if text == "":
        return None
    moment = parse_datetime(text)
    if moment is None:
        raise ValueError("expires_at must be an ISO datetime")
    if moment.tzinfo is not None:
        return moment
    return timezone.make_aware(moment, dt_timezone.utc)
@transaction.atomic
def create_memory_change_request(
    *,
    user_id: int,
    action: str,
    conversation_id: str = "",
    person_id: str = "",
    memory_id: str = "",
    memory_kind: str = "",
    content: dict[str, Any] | None = None,
    confidence_score: float | None = None,
    expires_at: str = "",
    reason: str = "",
    requested_by_identifier: str = "",
) -> MemoryChangeRequest:
    """Create a pending change request against a user's memory store.

    ``action`` must be ``create``, ``update`` or ``delete`` (case and
    surrounding whitespace are ignored). ``create`` requires
    ``conversation_id``; ``update`` and ``delete`` require ``memory_id``.
    Both ids, when given, must belong to ``user_id``.

    When a target memory is supplied, the request's conversation, person,
    memory kind and confidence score default to that memory's values if not
    given explicitly. ``expires_at`` is NOT defaulted from the memory: an
    empty value is stored as ``None``.

    Returns:
        The newly created ``MemoryChangeRequest`` with status ``"pending"``.

    Raises:
        ValueError: On an unknown action, an id that cannot be found for the
            user, a missing required id, or an unparseable ``expires_at``.
    """
    owner_id = int(user_id)
    verb = str(action or "").strip().lower()
    if verb not in {"create", "update", "delete"}:
        raise ValueError("action must be create/update/delete")

    target = None
    if memory_id:
        target = MemoryItem.objects.filter(user_id=owner_id, id=memory_id).first()
        if target is None:
            raise ValueError("memory_id not found")

    scope = None
    if conversation_id:
        scope = WorkspaceConversation.objects.filter(
            user_id=owner_id,
            id=conversation_id,
        ).first()
        if scope is None:
            raise ValueError("conversation_id not found")

    if verb == "create":
        if scope is None:
            raise ValueError("conversation_id is required for create")
    elif target is None:
        raise ValueError("memory_id is required for update/delete")

    # Resolve each proposed value, falling back to the target memory's value.
    resolved_conversation = scope or (target.conversation if target else None)

    resolved_person = person_id
    if not resolved_person and target is not None:
        resolved_person = str(target.person_id or "")
    resolved_person = resolved_person or None

    resolved_kind = str(memory_kind or (target.memory_kind if target else "")).strip()
    proposed_payload = dict(content or {})

    if confidence_score is not None:
        resolved_confidence = float(confidence_score)
    elif target:
        resolved_confidence = float(target.confidence_score)
    else:
        resolved_confidence = None

    resolved_expiry = _coerce_expires_at(expires_at)

    return MemoryChangeRequest.objects.create(
        user_id=owner_id,
        memory=target,
        conversation=resolved_conversation,
        person_id=resolved_person,
        action=verb,
        status="pending",
        proposed_memory_kind=resolved_kind,
        proposed_content=proposed_payload,
        proposed_confidence_score=resolved_confidence,
        proposed_expires_at=resolved_expiry,
        reason=str(reason or "").strip(),
        requested_by_identifier=str(requested_by_identifier or "").strip(),
    )
@transaction.atomic
def review_memory_change_request(
    *,
    user_id: int,
    request_id: str,
    decision: str,
    reviewer_identifier: str = "",
    note: str = "",
) -> MemoryChangeRequest:
    """Approve or reject a pending memory change request.

    Rejecting only records the review on the request. Approving records the
    review and then applies the request to the memory store:

    * ``create``: creates an active ``MemoryItem`` from the proposed values,
      or, if the request already points at a memory, activates it.
    * ``update``: copies the proposed kind/content/confidence (when set) and
      the proposed expiry (always, including ``None``) onto the memory and
      marks it active.
    * ``delete``: marks the memory ``"deprecated"``.

    A successfully applied request ends with status ``"applied"``. The whole
    operation runs in one transaction, so a failure while applying also
    rolls back the recorded review.

    Args:
        user_id: Owner of the request; the lookup is restricted to this user.
        request_id: Primary key of the request to review.
        decision: ``"approve"`` or ``"reject"`` (case-insensitive).
        reviewer_identifier: Stored on the request as the reviewer.
        note: Optional text appended to the request's reason as a review note.

    Returns:
        The updated ``MemoryChangeRequest``.

    Raises:
        ValueError: If the request is not pending, the decision is invalid,
            the request lacks the memory/conversation it needs, or its
            action is not create/update/delete.
        MemoryChangeRequest.DoesNotExist: Propagated from ``.get()`` when no
            matching request exists for this user.
    """
    req = MemoryChangeRequest.objects.select_related("memory", "conversation").get(
        id=request_id,
        user_id=int(user_id),
    )
    if req.status != "pending":
        raise ValueError("request is not pending")
    now = timezone.now()
    normalized_decision = str(decision or "").strip().lower()
    if normalized_decision not in {"approve", "reject"}:
        raise ValueError("decision must be approve/reject")

    req.reviewed_by_identifier = str(reviewer_identifier or "").strip()
    req.reviewed_at = now
    # Normalise before testing so a whitespace-only note does not append an
    # empty "Review note:" trailer to the reason.
    review_note = str(note or "").strip()
    if review_note:
        req.reason = f"{req.reason or ''}\n\nReview note: {review_note}".strip()

    approved = normalized_decision == "approve"
    req.status = "approved" if approved else "rejected"
    req.save(
        update_fields=[
            "status",
            "reviewed_by_identifier",
            "reviewed_at",
            "reason",
            "updated_at",
        ]
    )
    if not approved:
        return req

    memory = req.memory
    if req.action == "create":
        if memory is None:
            if req.conversation is None:
                raise ValueError("approved create request missing conversation")
            memory = MemoryItem.objects.create(
                user_id=int(user_id),
                conversation=req.conversation,
                person_id=req.person_id,
                memory_kind=req.proposed_memory_kind or "fact",
                status="active",
                content=req.proposed_content or {},
                confidence_score=float(req.proposed_confidence_score or 0.5),
                expires_at=req.proposed_expires_at,
                last_verified_at=now,
                provenance={"approved_request_id": str(req.id)},
            )
            req.memory = memory
        else:
            # The memory was already created (e.g. as a proposal); approving
            # the request just activates it.
            memory.status = "active"
            memory.last_verified_at = now
            memory.save(update_fields=["status", "last_verified_at", "updated_at"])
    elif req.action == "update":
        if memory is None:
            raise ValueError("approved update request missing memory")
        if req.proposed_memory_kind:
            memory.memory_kind = req.proposed_memory_kind
        if req.proposed_content:
            memory.content = req.proposed_content
        if req.proposed_confidence_score is not None:
            memory.confidence_score = float(req.proposed_confidence_score)
        # Expiry is copied unconditionally: a request without a proposed
        # expiry clears the memory's existing one.
        memory.expires_at = req.proposed_expires_at
        memory.last_verified_at = now
        memory.status = "active"
        memory.save()
    elif req.action == "delete":
        if memory is None:
            raise ValueError("approved delete request missing memory")
        memory.status = "deprecated"
        memory.last_verified_at = now
        memory.save(update_fields=["status", "last_verified_at", "updated_at"])
    else:
        # Refuse unknown actions rather than silently treating them as a
        # delete; raising also rolls back the approval recorded above.
        raise ValueError(f"unsupported request action: {req.action!r}")

    req.status = "applied"
    req.save(update_fields=["status", "memory", "updated_at"])
    return req
@transaction.atomic
def run_memory_hygiene(*, user_id: int | None = None, dry_run: bool = False) -> dict[str, int]:
    """Expire stale memories and flag contradictory ones for review.

    Two passes run over active ``MemoryItem`` rows (optionally limited to
    one user):

    1. Expiry: items whose ``expires_at`` is at or before now are marked
       ``"deprecated"``.
    2. Contradictions: the remaining items are grouped by (user, person,
       conversation, memory kind, field). A group holding more than one
       distinct normalised text is contradictory; every item in it gets a
       pending ``"update"`` change request unless one mentioning
       "contradiction" is already pending for that item.

    Args:
        user_id: Restrict the run to this user; ``None`` processes all users.
        dry_run: When true, nothing is written; only counts are computed.
            Expired items are excluded from the contradiction count so the
            numbers match what a real run would report.

    Returns:
        ``{"expired": ..., "contradictions": ..., "queued_requests": ...}``,
        where ``contradictions`` counts the items in contradictory groups
        and ``queued_requests`` the change requests actually created.
    """
    now = timezone.now()
    queryset = MemoryItem.objects.filter(status="active")
    if user_id is not None:
        queryset = queryset.filter(user_id=int(user_id))

    expired_ids = list(
        queryset.filter(expires_at__isnull=False, expires_at__lte=now).values_list(
            "id",
            flat=True,
        )
    )
    expired = len(expired_ids)
    if expired and not dry_run:
        # QuerySet.update() bypasses save(), so the modification timestamp
        # has to be set explicitly to stay consistent with other status
        # changes in this module.
        MemoryItem.objects.filter(id__in=expired_ids).update(
            status="deprecated",
            updated_at=now,
        )
    expired_lookup = set(expired_ids)

    contradictions = 0
    queued = 0
    grouped: dict[tuple[int, str, str, str, str], dict[str, list[MemoryItem]]] = {}
    for item in queryset.select_related("conversation", "person"):
        # In a real run expired rows are no longer "active" and drop out of
        # the queryset; skip them explicitly so a dry run behaves the same.
        if item.id in expired_lookup:
            continue
        content = item.content or {}
        field = str(content.get("field") or content.get("key") or "").strip().lower()
        text = _clean_value(str(content.get("text") or content.get("value") or "")).lower()
        if not field or not text:
            continue
        scope = (
            int(item.user_id),
            str(item.person_id or ""),
            str(item.conversation_id or ""),
            str(item.memory_kind or ""),
            field,
        )
        grouped.setdefault(scope, {}).setdefault(text, []).append(item)

    for values in grouped.values():
        # A single distinct text within a scope means no contradiction.
        if len(values) <= 1:
            continue
        flat = [item for subset in values.values() for item in subset]
        contradictions += len(flat)
        if dry_run:
            continue
        for item in flat:
            already_pending = MemoryChangeRequest.objects.filter(
                user_id=item.user_id,
                memory=item,
                action="update",
                status="pending",
                reason__icontains="contradiction",
            ).exists()
            if already_pending:
                continue
            MemoryChangeRequest.objects.create(
                user_id=item.user_id,
                memory=item,
                conversation=item.conversation,
                person=item.person,
                action="update",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                proposed_expires_at=item.expires_at,
                reason="Contradiction detected by hygiene job.",
                requested_by_identifier="memory-hygiene",
            )
            queued += 1

    log.info(
        "memory hygiene user=%s dry_run=%s expired=%s contradictions=%s queued=%s",
        user_id if user_id is not None else "-",
        dry_run,
        expired,
        contradictions,
        queued,
    )
    return {
        "expired": expired,
        "contradictions": contradictions,
        "queued_requests": queued,
    }