420 lines
14 KiB
Python
420 lines
14 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import timezone as dt_timezone
|
|
from typing import Any
|
|
|
|
from django.db import transaction
|
|
from django.utils import timezone
|
|
from django.utils.dateparse import parse_datetime
|
|
|
|
from core.models import (
|
|
MemoryChangeRequest,
|
|
MemoryItem,
|
|
MemorySourceReference,
|
|
MessageEvent,
|
|
WorkspaceConversation,
|
|
)
|
|
from core.util import logs
|
|
|
|
# Module-level logger for this file, obtained under the "memory-pipeline" name.
log = logs.get_logger("memory-pipeline")
|
|
|
|
_LIKE_RE = re.compile(
|
|
r"\b(?:i (?:like|love|prefer)|my favorite)\s+(?P<value>[^.!?]{2,120})",
|
|
re.IGNORECASE,
|
|
)
|
|
_DISLIKE_RE = re.compile(
|
|
r"\b(?:i (?:dislike|hate|avoid)|i don't like)\s+(?P<value>[^.!?]{2,120})",
|
|
re.IGNORECASE,
|
|
)
|
|
_STYLE_RE = re.compile(
|
|
r"\b(?:please|pls)\s+(?P<value>[^.!?]{3,120})",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _clean_value(value: str) -> str:
|
|
return " ".join(str(value or "").strip().split())
|
|
|
|
|
|
def extract_memory_candidates(text: str) -> list[dict[str, Any]]:
    """Scan *text* with the module's regex heuristics and return memory candidates.

    Each candidate is a dict with the keys ``memory_kind``, ``field``, ``text``
    and ``confidence_score``. Candidates are ordered by heuristic (likes,
    dislikes, communication style) and then by position in the text. Empty or
    whitespace-only input yields an empty list.
    """
    message = str(text or "").strip()
    if not message:
        return []

    # (pattern, content field, memory kind, confidence) for each heuristic.
    rules = (
        (_LIKE_RE, "likes", "fact", 0.68),
        (_DISLIKE_RE, "dislikes", "fact", 0.68),
        (_STYLE_RE, "communication_style", "state", 0.52),
    )

    found: list[dict[str, Any]] = []
    for pattern, field_name, memory_kind, score in rules:
        for hit in pattern.finditer(message):
            phrase = _clean_value(hit.group("value"))
            # Whitespace normalisation can shrink a match below the useful minimum.
            if len(phrase) >= 3:
                found.append(
                    dict(
                        memory_kind=memory_kind,
                        field=field_name,
                        text=phrase,
                        confidence_score=score,
                    )
                )
    return found
|
|
|
|
|
|
def _existing_fingerprints(user_id: int) -> set[tuple[str, str, str, str]]:
    """Collect a de-duplication fingerprint for every memory item owned by *user_id*.

    A fingerprint is ``(memory_kind, conversation_id, person_id, "field:text")``
    with the kind, field and text lower-cased, so it can be compared against
    freshly extracted candidates regardless of letter case.
    """

    def _fingerprint(memory) -> tuple[str, str, str, str]:
        payload = memory.content or {}
        field_name = str(payload.get("field") or "").strip().lower()
        phrase = _clean_value(str(payload.get("text") or "")).lower()
        return (
            str(memory.memory_kind or "").strip().lower(),
            str(memory.conversation_id or "").strip(),
            str(memory.person_id or "").strip(),
            f"{field_name}:{phrase}",
        )

    # Only the columns that feed the fingerprint are loaded.
    stored = MemoryItem.objects.filter(user_id=int(user_id)).only(
        "memory_kind",
        "conversation_id",
        "person_id",
        "content",
    )
    return {_fingerprint(memory) for memory in stored}
|
|
|
|
|
|
def _infer_single_person_id(conversation: WorkspaceConversation) -> str:
|
|
participant_ids = list(conversation.participants.values_list("id", flat=True)[:2])
|
|
if len(participant_ids) != 1:
|
|
return ""
|
|
return str(participant_ids[0] or "")
|
|
|
|
|
|
@transaction.atomic
def suggest_memories_from_recent_messages(
    *,
    user_id: int,
    limit_messages: int = 300,
    max_items: int = 30,
) -> dict[str, int]:
    """Propose memory items from a user's most recent inbound messages.

    The newest ``direction="in"`` message events are scanned with
    ``extract_memory_candidates``. Every candidate whose fingerprint is not
    already stored for the user produces three rows: a ``MemoryItem`` with
    status ``"proposed"``, a ``MemorySourceReference`` pointing at the message,
    and a pending ``"create"`` ``MemoryChangeRequest``.

    Args:
        user_id: Owner of the messages and of the created rows.
        limit_messages: Maximum messages to scan; clamped to 1..2000
            (a falsy value means 300).
        max_items: Maximum proposals to queue; clamped to 1..500
            (a falsy value means 30).

    Returns:
        ``{"scanned": <messages visited>, "queued": <proposals created>}``.
    """
    owner_id = int(user_id)
    safe_limit_messages = max(1, min(2000, int(limit_messages or 300)))
    safe_max_items = max(1, min(500, int(max_items or 30)))
    existing = _existing_fingerprints(owner_id)

    # Conversation key -> inferred person id. Many messages share a conversation,
    # so the participants query runs once per conversation, not once per message.
    person_by_conversation: dict[str, str] = {}

    scanned = 0
    queued = 0
    rows = (
        MessageEvent.objects.filter(user_id=owner_id, direction="in")
        .select_related("conversation")
        .order_by("-ts")[:safe_limit_messages]
    )
    for event in rows:
        scanned += 1
        conversation = event.conversation
        if conversation is None:
            # NOTE(review): assumes the conversation link may be empty. Such a
            # message used to abort the whole batch with AttributeError.
            continue

        candidates = extract_memory_candidates(event.text or "")
        if not candidates:
            # Nothing to propose, so skip the participants lookup entirely.
            continue

        conversation_key = str(event.conversation_id or "").strip()
        if conversation_key not in person_by_conversation:
            person_by_conversation[conversation_key] = _infer_single_person_id(conversation)
        person_id = person_by_conversation[conversation_key]

        for candidate in candidates:
            field = str(candidate.get("field") or "").strip().lower()
            text = _clean_value(str(candidate.get("text") or ""))
            if not text:
                continue
            memory_kind = str(candidate.get("memory_kind") or "fact")
            fingerprint = (
                memory_kind.strip().lower(),
                conversation_key,
                person_id,
                f"{field}:{text.lower()}",
            )
            if fingerprint in existing:
                continue

            item = MemoryItem.objects.create(
                user_id=owner_id,
                conversation=conversation,
                person_id=person_id or None,
                memory_kind=memory_kind,
                status="proposed",
                content={"field": field, "text": text},
                provenance={
                    "pipeline": "message_regex",
                    "message_event_id": str(event.id),
                },
                confidence_score=float(candidate.get("confidence_score") or 0.5),
            )
            MemorySourceReference.objects.create(
                memory=item,
                message_event=event,
                source_label="message_event",
            )
            MemoryChangeRequest.objects.create(
                user_id=owner_id,
                memory=item,
                conversation=conversation,
                person_id=person_id or None,
                action="create",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                reason="Auto-suggested from recent inbound messages.",
                requested_by_identifier="memory-pipeline",
            )
            # Remember the fingerprint so a repeat later in this run is skipped.
            existing.add(fingerprint)
            queued += 1
            if queued >= safe_max_items:
                return {"scanned": scanned, "queued": queued}
    return {"scanned": scanned, "queued": queued}
|
|
|
|
|
|
def _coerce_expires_at(value: Any):
    """Turn an ``expires_at`` input into a timezone-aware datetime, or ``None``.

    Args:
        value: Anything whose string form is an ISO-style datetime; falsy or
            blank input means "no expiry".

    Returns:
        ``None`` for blank input, otherwise the parsed datetime. A value
        without timezone information is interpreted as UTC.

    Raises:
        ValueError: If the text cannot be parsed as a datetime, or describes
            an impossible one.
    """
    raw = str(value or "").strip()
    if not raw:
        return None
    try:
        parsed = parse_datetime(raw)
    except ValueError as exc:
        # parse_datetime returns None for malformed text but raises for
        # well-formed yet impossible values; report both the same way.
        raise ValueError("expires_at must be an ISO datetime") from exc
    if parsed is None:
        raise ValueError("expires_at must be an ISO datetime")
    if parsed.tzinfo is None:
        return timezone.make_aware(parsed, dt_timezone.utc)
    return parsed
|
|
|
|
|
|
@transaction.atomic
def create_memory_change_request(
    *,
    user_id: int,
    action: str,
    conversation_id: str = "",
    person_id: str = "",
    memory_id: str = "",
    memory_kind: str = "",
    content: dict[str, Any] | None = None,
    confidence_score: float | None = None,
    expires_at: str = "",
    reason: str = "",
    requested_by_identifier: str = "",
) -> MemoryChangeRequest:
    """Record a pending request to create, update or delete a memory item.

    The referenced memory and conversation are looked up within *user_id*'s
    rows. When an existing memory is referenced, any of conversation, person,
    memory kind, confidence score and expiry that the caller leaves out are
    taken from that memory.

    Args:
        user_id: Owner of the request and of any referenced rows.
        action: ``"create"``, ``"update"`` or ``"delete"`` (case-insensitive).
        conversation_id: Required for ``create``; optional otherwise.
        person_id: Optional person the memory is about.
        memory_id: Required for ``update`` and ``delete``.
        memory_kind: Proposed kind.
        content: Proposed content; copied, so the caller's dict is not shared.
        confidence_score: Proposed confidence.
        expires_at: Proposed expiry as ISO datetime text. When blank and a
            memory is referenced, that memory's current expiry is proposed.
        reason: Free-text justification.
        requested_by_identifier: Who or what is asking.

    Returns:
        The newly created ``MemoryChangeRequest`` with status ``"pending"``.

    Raises:
        ValueError: For an unknown action, an id that does not resolve, a
            missing required id, or an unparseable ``expires_at``.
    """
    owner_id = int(user_id)
    normalized_action = str(action or "").strip().lower()
    if normalized_action not in {"create", "update", "delete"}:
        raise ValueError("action must be create/update/delete")

    memory = None
    if memory_id:
        memory = MemoryItem.objects.filter(user_id=owner_id, id=memory_id).first()
        if memory is None:
            raise ValueError("memory_id not found")

    conversation = None
    if conversation_id:
        conversation = WorkspaceConversation.objects.filter(
            user_id=owner_id,
            id=conversation_id,
        ).first()
        if conversation is None:
            raise ValueError("conversation_id not found")

    if normalized_action == "create" and conversation is None:
        raise ValueError("conversation_id is required for create")
    if normalized_action in {"update", "delete"} and memory is None:
        raise ValueError("memory_id is required for update/delete")

    if confidence_score is not None:
        proposed_confidence = float(confidence_score)
    elif memory is not None:
        proposed_confidence = float(memory.confidence_score)
    else:
        proposed_confidence = None

    proposed_expires_at = _coerce_expires_at(expires_at)
    if proposed_expires_at is None and memory is not None:
        # Approving an update copies the proposed expiry onto the memory
        # unconditionally, so an omitted value must carry the current expiry
        # forward or the update would silently erase it.
        proposed_expires_at = memory.expires_at

    return MemoryChangeRequest.objects.create(
        user_id=owner_id,
        memory=memory,
        conversation=conversation or (memory.conversation if memory else None),
        person_id=person_id or (str(memory.person_id or "") if memory else "") or None,
        action=normalized_action,
        status="pending",
        proposed_memory_kind=str(memory_kind or (memory.memory_kind if memory else "")).strip(),
        proposed_content=dict(content or {}),
        proposed_confidence_score=proposed_confidence,
        proposed_expires_at=proposed_expires_at,
        reason=str(reason or "").strip(),
        requested_by_identifier=str(requested_by_identifier or "").strip(),
    )
|
|
|
|
|
|
@transaction.atomic
def review_memory_change_request(
    *,
    user_id: int,
    request_id: str,
    decision: str,
    reviewer_identifier: str = "",
    note: str = "",
) -> MemoryChangeRequest:
    """Approve or reject a pending memory change request and apply the result.

    A rejection only marks the request ``"rejected"``. An approval marks it
    ``"approved"``, applies the requested action to the memory item, and then
    marks the request ``"applied"``:

    * ``create``: activates the attached memory, or creates an active one from
      the proposed values when none is attached.
    * ``update``: copies the proposed kind, content and confidence (when
      provided) and the proposed expiry onto the memory, then activates it.
    * ``delete``: marks the memory ``"deprecated"``.

    Args:
        user_id: Owner of the request.
        request_id: Id of the request to review.
        decision: ``"approve"`` or ``"reject"`` (case-insensitive).
        reviewer_identifier: Who reviewed it.
        note: Optional note appended to the request's reason.

    Returns:
        The reviewed ``MemoryChangeRequest``.

    Raises:
        ValueError: If the request is not pending, the decision is unknown, or
            an approved request lacks the memory/conversation it needs.
        MemoryChangeRequest.DoesNotExist: Propagated from ``.get()`` when no
            request matches *request_id* for this user.
    """
    req = MemoryChangeRequest.objects.select_related("memory", "conversation").get(
        id=request_id,
        user_id=int(user_id),
    )
    if req.status != "pending":
        raise ValueError("request is not pending")

    now = timezone.now()
    normalized_decision = str(decision or "").strip().lower()
    if normalized_decision not in {"approve", "reject"}:
        raise ValueError("decision must be approve/reject")

    req.reviewed_by_identifier = str(reviewer_identifier or "").strip()
    req.reviewed_at = now
    # Normalise first so a whitespace-only note does not append an empty
    # "Review note:" trailer to the reason.
    note_text = str(note or "").strip()
    if note_text:
        req.reason = f"{req.reason}\n\nReview note: {note_text}".strip()

    review_fields = [
        "status",
        "reviewed_by_identifier",
        "reviewed_at",
        "reason",
        "updated_at",
    ]

    if normalized_decision == "reject":
        req.status = "rejected"
        req.save(update_fields=review_fields)
        return req

    req.status = "approved"
    req.save(update_fields=review_fields)

    memory = req.memory
    if req.action == "create":
        if memory is None:
            if req.conversation is None:
                raise ValueError("approved create request missing conversation")
            proposed_score = req.proposed_confidence_score
            memory = MemoryItem.objects.create(
                user_id=int(user_id),
                conversation=req.conversation,
                person_id=req.person_id,
                memory_kind=req.proposed_memory_kind or "fact",
                status="active",
                content=req.proposed_content or {},
                # Only a missing score gets the default; an explicit 0.0 is kept.
                confidence_score=0.5 if proposed_score is None else float(proposed_score),
                expires_at=req.proposed_expires_at,
                last_verified_at=now,
                provenance={"approved_request_id": str(req.id)},
            )
            req.memory = memory
        else:
            memory.status = "active"
            memory.last_verified_at = now
            memory.save(update_fields=["status", "last_verified_at", "updated_at"])
    elif req.action == "update":
        if memory is None:
            raise ValueError("approved update request missing memory")
        if req.proposed_memory_kind:
            memory.memory_kind = req.proposed_memory_kind
        if req.proposed_content:
            memory.content = req.proposed_content
        if req.proposed_confidence_score is not None:
            memory.confidence_score = float(req.proposed_confidence_score)
        # The expiry is always taken from the request, including None.
        memory.expires_at = req.proposed_expires_at
        memory.last_verified_at = now
        memory.status = "active"
        memory.save()
    else:
        if memory is None:
            raise ValueError("approved delete request missing memory")
        memory.status = "deprecated"
        memory.last_verified_at = now
        memory.save(update_fields=["status", "last_verified_at", "updated_at"])

    req.status = "applied"
    req.save(update_fields=["status", "memory", "updated_at"])
    return req
|
|
|
|
|
|
@transaction.atomic
def run_memory_hygiene(*, user_id: int | None = None, dry_run: bool = False) -> dict[str, int]:
    """Deprecate expired memories and queue review requests for contradictions.

    Two passes run over active memory items (all users, or just *user_id*):

    1. Items whose ``expires_at`` is at or before now are marked
       ``"deprecated"``.
    2. The remaining items are grouped by user, person, conversation, memory
       kind and content field. A group holding more than one distinct text is
       treated as contradictory, and each of its items gets a pending
       ``"update"`` change request unless one flagged as a contradiction is
       already pending.

    Args:
        user_id: Restrict the run to one user; ``None`` covers everyone.
        dry_run: Count what would happen without writing anything.

    Returns:
        ``{"expired": ..., "contradictions": ..., "queued_requests": ...}``.
        ``queued_requests`` is always 0 on a dry run.
    """
    now = timezone.now()
    queryset = MemoryItem.objects.filter(status="active")
    if user_id is not None:
        queryset = queryset.filter(user_id=int(user_id))

    expired_ids = list(
        queryset.filter(expires_at__isnull=False, expires_at__lte=now).values_list(
            "id",
            flat=True,
        )
    )
    expired = len(expired_ids)
    if expired and not dry_run:
        # QuerySet.update() bypasses save(), so the modification timestamp is
        # set explicitly to match the save(update_fields=[..., "updated_at"])
        # calls used elsewhere in this module.
        MemoryItem.objects.filter(id__in=expired_ids).update(
            status="deprecated",
            updated_at=now,
        )
    # On a dry run the expired rows are still "active"; skip them below so the
    # contradiction counts match what a real run would report.
    expired_lookup = set(expired_ids)

    contradictions = 0
    queued = 0
    grouped: dict[tuple[int, str, str, str, str], dict[str, list[MemoryItem]]] = {}
    for item in queryset.select_related("conversation", "person"):
        if item.id in expired_lookup:
            continue
        content = item.content or {}
        # Content may use either the field/text or the key/value spelling.
        field = str(content.get("field") or content.get("key") or "").strip().lower()
        text = _clean_value(str(content.get("text") or content.get("value") or "")).lower()
        if not field or not text:
            continue
        scope = (
            int(item.user_id),
            str(item.person_id or ""),
            str(item.conversation_id or ""),
            str(item.memory_kind or ""),
            field,
        )
        grouped.setdefault(scope, {}).setdefault(text, []).append(item)

    for values in grouped.values():
        if len(values) <= 1:
            continue
        flat = [item for subset in values.values() for item in subset]
        contradictions += len(flat)
        if dry_run:
            continue
        for item in flat:
            already_pending = MemoryChangeRequest.objects.filter(
                user_id=item.user_id,
                memory=item,
                action="update",
                status="pending",
                reason__icontains="contradiction",
            ).exists()
            if already_pending:
                continue
            MemoryChangeRequest.objects.create(
                user_id=item.user_id,
                memory=item,
                conversation=item.conversation,
                person=item.person,
                action="update",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                proposed_expires_at=item.expires_at,
                reason="Contradiction detected by hygiene job.",
                requested_by_identifier="memory-hygiene",
            )
            queued += 1

    log.info(
        "memory hygiene user=%s dry_run=%s expired=%s contradictions=%s queued=%s",
        user_id if user_id is not None else "-",
        dry_run,
        expired,
        contradictions,
        queued,
    )
    return {
        "expired": expired,
        "contradictions": contradictions,
        "queued_requests": queued,
    }
|