Increase platform abstraction cohesion
This commit is contained in:
419
core/memory/pipeline.py
Normal file
419
core/memory/pipeline.py
Normal file
@@ -0,0 +1,419 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import timezone as dt_timezone
|
||||
from typing import Any
|
||||
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
from django.utils.dateparse import parse_datetime
|
||||
|
||||
from core.models import (
|
||||
MemoryChangeRequest,
|
||||
MemoryItem,
|
||||
MemorySourceReference,
|
||||
MessageEvent,
|
||||
WorkspaceConversation,
|
||||
)
|
||||
from core.util import logs
|
||||
|
||||
log = logs.get_logger("memory-pipeline")
|
||||
|
||||
# Heuristic extraction patterns for inbound message text.  Each pattern
# captures a short free-text <value> group bounded by sentence punctuation.
# Matches first-person preference statements, e.g. "I like hiking".
_LIKE_RE = re.compile(
    r"\b(?:i (?:like|love|prefer)|my favorite)\s+(?P<value>[^.!?]{2,120})",
    re.IGNORECASE,
)
# Matches first-person aversion statements, e.g. "I hate spam".
_DISLIKE_RE = re.compile(
    r"\b(?:i (?:dislike|hate|avoid)|i don't like)\s+(?P<value>[^.!?]{2,120})",
    re.IGNORECASE,
)
# Matches polite requests, e.g. "please keep replies short" -- treated as a
# hint about preferred communication style.
_STYLE_RE = re.compile(
    r"\b(?:please|pls)\s+(?P<value>[^.!?]{3,120})",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _clean_value(value: str) -> str:
|
||||
return " ".join(str(value or "").strip().split())
|
||||
|
||||
|
||||
def extract_memory_candidates(text: str) -> list[dict[str, Any]]:
    """Scan *text* with the heuristic regexes and return memory candidates.

    Each candidate is a dict with ``memory_kind``, ``field``, ``text`` and a
    fixed per-pattern ``confidence_score``.  Matches shorter than three
    characters after whitespace normalisation are discarded.
    """
    normalized = str(text or "").strip()
    if not normalized:
        return []

    rules = (
        (_LIKE_RE, "likes", "fact", 0.68),
        (_DISLIKE_RE, "dislikes", "fact", 0.68),
        (_STYLE_RE, "communication_style", "state", 0.52),
    )
    found: list[dict[str, Any]] = []
    for pattern, field_name, memory_kind, score in rules:
        for hit in pattern.finditer(normalized):
            cleaned = _clean_value(hit.group("value"))
            if len(cleaned) < 3:
                continue
            found.append(
                {
                    "memory_kind": memory_kind,
                    "field": field_name,
                    "text": cleaned,
                    "confidence_score": score,
                }
            )
    return found
|
||||
|
||||
|
||||
def _existing_fingerprints(user_id: int) -> set[tuple[str, str, str, str]]:
    """Return dedupe fingerprints for every memory already stored for a user.

    A fingerprint is ``(memory_kind, conversation_id, person_id,
    "field:text")`` with kind/field/text lowercased and
    whitespace-normalised.
    """
    seen: set[tuple[str, str, str, str]] = set()
    rows = MemoryItem.objects.filter(user_id=int(user_id)).only(
        "memory_kind",
        "conversation_id",
        "person_id",
        "content",
    )
    for row in rows:
        payload = row.content or {}
        field_key = str(payload.get("field") or "").strip().lower()
        text_key = _clean_value(str(payload.get("text") or "")).lower()
        seen.add(
            (
                str(row.memory_kind or "").strip().lower(),
                str(row.conversation_id or "").strip(),
                str(row.person_id or "").strip(),
                f"{field_key}:{text_key}",
            )
        )
    return seen
|
||||
|
||||
|
||||
def _infer_single_person_id(conversation: WorkspaceConversation) -> str:
|
||||
participant_ids = list(conversation.participants.values_list("id", flat=True)[:2])
|
||||
if len(participant_ids) != 1:
|
||||
return ""
|
||||
return str(participant_ids[0] or "")
|
||||
|
||||
|
||||
@transaction.atomic
def suggest_memories_from_recent_messages(
    *,
    user_id: int,
    limit_messages: int = 300,
    max_items: int = 30,
) -> dict[str, int]:
    """Scan recent inbound messages and queue proposed memories for review.

    For every regex candidate found in the newest ``limit_messages`` inbound
    MessageEvents, creates a ``proposed`` MemoryItem, a source reference back
    to the originating message, and a ``pending`` MemoryChangeRequest --
    skipping any candidate whose fingerprint already exists.  Stops once
    ``max_items`` new suggestions have been queued.

    Returns ``{"scanned": <messages read>, "queued": <suggestions created>}``.
    """
    # Clamp caller-supplied limits to sane bounds.
    safe_limit_messages = max(1, min(2000, int(limit_messages or 300)))
    safe_max_items = max(1, min(500, int(max_items or 30)))
    existing = _existing_fingerprints(int(user_id))

    scanned = 0
    queued = 0
    rows = (
        MessageEvent.objects.filter(user_id=int(user_id), direction="in")
        .select_related("conversation")
        .order_by("-ts")[:safe_limit_messages]
    )
    for event in rows:
        scanned += 1
        # Attribute the memory to a person only when unambiguous.
        person_id = _infer_single_person_id(event.conversation)
        for candidate in extract_memory_candidates(event.text or ""):
            field = str(candidate.get("field") or "").strip().lower()
            text = _clean_value(str(candidate.get("text") or ""))
            if not text:
                continue
            # Key shape must mirror the one built by _existing_fingerprints.
            fingerprint = (
                str(candidate.get("memory_kind") or "fact").strip().lower(),
                str(event.conversation_id or "").strip(),
                person_id,
                f"{field}:{text.lower()}",
            )
            if fingerprint in existing:
                continue

            item = MemoryItem.objects.create(
                user_id=int(user_id),
                conversation=event.conversation,
                person_id=person_id or None,
                memory_kind=str(candidate.get("memory_kind") or "fact"),
                status="proposed",
                content={"field": field, "text": text},
                provenance={
                    "pipeline": "message_regex",
                    "message_event_id": str(event.id),
                },
                confidence_score=float(candidate.get("confidence_score") or 0.5),
            )
            MemorySourceReference.objects.create(
                memory=item,
                message_event=event,
                source_label="message_event",
            )
            MemoryChangeRequest.objects.create(
                user_id=int(user_id),
                memory=item,
                conversation=event.conversation,
                person_id=person_id or None,
                action="create",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                reason="Auto-suggested from recent inbound messages.",
                requested_by_identifier="memory-pipeline",
            )
            # Track the new fingerprint so duplicates within this run are
            # skipped too.
            existing.add(fingerprint)
            queued += 1
            if queued >= safe_max_items:
                return {"scanned": scanned, "queued": queued}
    return {"scanned": scanned, "queued": queued}
|
||||
|
||||
|
||||
def _coerce_expires_at(value: Any):
    """Parse an optional ISO-format string into an aware datetime.

    Blank/None input yields None; naive datetimes are assumed to be UTC.
    Raises ValueError when the string cannot be parsed.
    """
    text = str(value or "").strip()
    if not text:
        return None
    moment = parse_datetime(text)
    if moment is None:
        raise ValueError("expires_at must be an ISO datetime")
    if moment.tzinfo is not None:
        return moment
    return timezone.make_aware(moment, dt_timezone.utc)
|
||||
|
||||
|
||||
@transaction.atomic
def create_memory_change_request(
    *,
    user_id: int,
    action: str,
    conversation_id: str = "",
    person_id: str = "",
    memory_id: str = "",
    memory_kind: str = "",
    content: dict[str, Any] | None = None,
    confidence_score: float | None = None,
    expires_at: str = "",
    reason: str = "",
    requested_by_identifier: str = "",
) -> MemoryChangeRequest:
    """Validate and persist a pending memory change request.

    Raises ValueError for an unknown action, a missing referenced memory or
    conversation, or when the action's required reference is absent
    ("create" needs a conversation; "update"/"delete" need a memory).
    Unspecified kind/person/confidence fall back to the referenced memory.
    """
    verb = str(action or "").strip().lower()
    if verb not in {"create", "update", "delete"}:
        raise ValueError("action must be create/update/delete")

    target_memory = None
    if memory_id:
        target_memory = MemoryItem.objects.filter(
            user_id=int(user_id), id=memory_id
        ).first()
        if target_memory is None:
            raise ValueError("memory_id not found")

    target_conversation = None
    if conversation_id:
        target_conversation = WorkspaceConversation.objects.filter(
            user_id=int(user_id),
            id=conversation_id,
        ).first()
        if target_conversation is None:
            raise ValueError("conversation_id not found")

    if verb == "create" and target_conversation is None:
        raise ValueError("conversation_id is required for create")
    if target_memory is None and verb in {"update", "delete"}:
        raise ValueError("memory_id is required for update/delete")

    # Fall back to the referenced memory's own scope when not given explicitly.
    if target_conversation is None and target_memory is not None:
        target_conversation = target_memory.conversation
    resolved_person = person_id
    if not resolved_person and target_memory is not None:
        resolved_person = str(target_memory.person_id or "")
    fallback_kind = target_memory.memory_kind if target_memory is not None else ""
    if confidence_score is not None:
        resolved_confidence = float(confidence_score)
    elif target_memory is not None:
        resolved_confidence = float(target_memory.confidence_score)
    else:
        resolved_confidence = None

    return MemoryChangeRequest.objects.create(
        user_id=int(user_id),
        memory=target_memory,
        conversation=target_conversation,
        person_id=resolved_person or None,
        action=verb,
        status="pending",
        proposed_memory_kind=str(memory_kind or fallback_kind).strip(),
        proposed_content=dict(content or {}),
        proposed_confidence_score=resolved_confidence,
        proposed_expires_at=_coerce_expires_at(expires_at),
        reason=str(reason or "").strip(),
        requested_by_identifier=str(requested_by_identifier or "").strip(),
    )
|
||||
|
||||
|
||||
@transaction.atomic
def review_memory_change_request(
    *,
    user_id: int,
    request_id: str,
    decision: str,
    reviewer_identifier: str = "",
    note: str = "",
) -> MemoryChangeRequest:
    """Approve or reject a pending change request, applying it if approved.

    Rejection only records reviewer metadata and sets status "rejected".
    Approval first marks the request "approved", then applies the action to
    the MemoryItem (create, update, or delete-as-deprecate) and finally marks
    the request "applied".  Raises ValueError for a non-pending request, an
    unknown decision, or a request missing its required references; raises
    MemoryChangeRequest.DoesNotExist when the request is not found.
    """
    req = MemoryChangeRequest.objects.select_related("memory", "conversation").get(
        id=request_id,
        user_id=int(user_id),
    )
    if req.status != "pending":
        raise ValueError("request is not pending")

    now = timezone.now()
    normalized_decision = str(decision or "").strip().lower()
    if normalized_decision not in {"approve", "reject"}:
        raise ValueError("decision must be approve/reject")

    req.reviewed_by_identifier = str(reviewer_identifier or "").strip()
    req.reviewed_at = now
    if note:
        # Append the review note to the existing reason rather than replace it.
        req.reason = f"{req.reason}\n\nReview note: {str(note).strip()}".strip()

    if normalized_decision == "reject":
        req.status = "rejected"
        req.save(
            update_fields=[
                "status",
                "reviewed_by_identifier",
                "reviewed_at",
                "reason",
                "updated_at",
            ]
        )
        return req

    # Persist the reviewer's verdict before mutating the memory.
    req.status = "approved"
    req.save(
        update_fields=[
            "status",
            "reviewed_by_identifier",
            "reviewed_at",
            "reason",
            "updated_at",
        ]
    )

    memory = req.memory
    if req.action == "create":
        if memory is None:
            if req.conversation is None:
                raise ValueError("approved create request missing conversation")
            memory = MemoryItem.objects.create(
                user_id=int(user_id),
                conversation=req.conversation,
                person_id=req.person_id,
                memory_kind=req.proposed_memory_kind or "fact",
                status="active",
                content=req.proposed_content or {},
                confidence_score=float(req.proposed_confidence_score or 0.5),
                expires_at=req.proposed_expires_at,
                last_verified_at=now,
                provenance={"approved_request_id": str(req.id)},
            )
            req.memory = memory
        else:
            # The pipeline already created a "proposed" item; just activate it.
            memory.status = "active"
            memory.last_verified_at = now
            memory.save(update_fields=["status", "last_verified_at", "updated_at"])
    elif req.action == "update":
        if memory is None:
            raise ValueError("approved update request missing memory")
        # Only overwrite fields the request actually proposes; expires_at is
        # always taken from the request (a blank proposal clears it).
        if req.proposed_memory_kind:
            memory.memory_kind = req.proposed_memory_kind
        if req.proposed_content:
            memory.content = req.proposed_content
        if req.proposed_confidence_score is not None:
            memory.confidence_score = float(req.proposed_confidence_score)
        memory.expires_at = req.proposed_expires_at
        memory.last_verified_at = now
        memory.status = "active"
        memory.save()
    else:
        # action == "delete": soft-delete by marking the item deprecated.
        if memory is None:
            raise ValueError("approved delete request missing memory")
        memory.status = "deprecated"
        memory.last_verified_at = now
        memory.save(update_fields=["status", "last_verified_at", "updated_at"])

    req.status = "applied"
    req.save(update_fields=["status", "memory", "updated_at"])
    return req
|
||||
|
||||
|
||||
@transaction.atomic
def run_memory_hygiene(*, user_id: int | None = None, dry_run: bool = False) -> dict[str, int]:
    """Expire stale memories and queue review requests for contradictions.

    Two passes over active MemoryItems (optionally scoped to one user):

    1. Items whose ``expires_at`` has passed are marked ``deprecated``.
    2. Items sharing the same (user, person, conversation, kind, field)
       scope but holding different normalised text values are treated as
       contradictory; each gets a pending "update" MemoryChangeRequest for a
       human to resolve (skipped when a matching one is already pending).

    With ``dry_run=True`` nothing is written; counts are still computed.
    Returns ``{"expired", "contradictions", "queued_requests"}``.
    """
    now = timezone.now()
    queryset = MemoryItem.objects.filter(status="active")
    if user_id is not None:
        queryset = queryset.filter(user_id=int(user_id))

    expired_ids = list(
        queryset.filter(expires_at__isnull=False, expires_at__lte=now).values_list(
            "id",
            flat=True,
        )
    )
    expired = len(expired_ids)
    if expired and not dry_run:
        MemoryItem.objects.filter(id__in=expired_ids).update(status="deprecated")

    contradictions = 0
    queued = 0
    # scope -> normalised text -> items carrying that text.  More than one
    # distinct text within a scope means those memories contradict each other.
    grouped: dict[tuple[int, str, str, str, str], dict[str, list[MemoryItem]]] = {}
    # NOTE: the queryset re-evaluates here, so items deprecated above are
    # already excluded from contradiction detection (except under dry_run,
    # where they were left active).
    for item in queryset.select_related("conversation", "person"):
        content = item.content or {}
        field = str(content.get("field") or content.get("key") or "").strip().lower()
        text = _clean_value(str(content.get("text") or content.get("value") or "")).lower()
        if not field or not text:
            continue
        scope = (
            int(item.user_id),
            str(item.person_id or ""),
            str(item.conversation_id or ""),
            str(item.memory_kind or ""),
            field,
        )
        grouped.setdefault(scope, {}).setdefault(text, []).append(item)

    for values in grouped.values():
        # A single distinct text value cannot contradict itself.
        if len(values) <= 1:
            continue
        flat = [item for subset in values.values() for item in subset]
        contradictions += len(flat)
        if dry_run:
            continue
        for item in flat:
            # Avoid piling up duplicate review requests across hygiene runs.
            already_pending = MemoryChangeRequest.objects.filter(
                user_id=item.user_id,
                memory=item,
                action="update",
                status="pending",
                reason__icontains="contradiction",
            ).exists()
            if already_pending:
                continue
            MemoryChangeRequest.objects.create(
                user_id=item.user_id,
                memory=item,
                conversation=item.conversation,
                person=item.person,
                action="update",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                proposed_expires_at=item.expires_at,
                reason="Contradiction detected by hygiene job.",
                requested_by_identifier="memory-hygiene",
            )
            queued += 1

    log.info(
        "memory hygiene user=%s dry_run=%s expired=%s contradictions=%s queued=%s",
        user_id if user_id is not None else "-",
        dry_run,
        expired,
        contradictions,
        queued,
    )
    return {
        "expired": expired,
        "contradictions": contradictions,
        "queued_requests": queued,
    }
|
||||
Reference in New Issue
Block a user