Increase platform abstraction cohesion

This commit is contained in:
2026-03-06 17:47:58 +00:00
parent 438e561da0
commit 8c091b1e6d
55 changed files with 6555 additions and 440 deletions

View File

@@ -1,3 +1,4 @@
from .search_backend import get_memory_search_backend
from .retrieval import retrieve_memories_for_prompt
__all__ = ["get_memory_search_backend"]
__all__ = ["get_memory_search_backend", "retrieve_memories_for_prompt"]

419
core/memory/pipeline.py Normal file
View File

@@ -0,0 +1,419 @@
from __future__ import annotations
import re
from datetime import timezone as dt_timezone
from typing import Any
from django.db import transaction
from django.utils import timezone
from django.utils.dateparse import parse_datetime
from core.models import (
MemoryChangeRequest,
MemoryItem,
MemorySourceReference,
MessageEvent,
WorkspaceConversation,
)
from core.util import logs
log = logs.get_logger("memory-pipeline")

# Heuristic extraction patterns for inbound message text. Each captures a
# short free-text "value" span, stopping at sentence-ending punctuation and
# bounded in length so candidate memories stay concise.

# "i like/love/prefer ..." or "my favorite ..." -> a positive preference.
_LIKE_RE = re.compile(
    r"\b(?:i (?:like|love|prefer)|my favorite)\s+(?P<value>[^.!?]{2,120})",
    re.IGNORECASE,
)
# "i dislike/hate/avoid ..." or "i don't like ..." -> a negative preference.
_DISLIKE_RE = re.compile(
    r"\b(?:i (?:dislike|hate|avoid)|i don't like)\s+(?P<value>[^.!?]{2,120})",
    re.IGNORECASE,
)
# "please/pls ..." -> a communication-style request (given lower confidence
# downstream than the preference patterns).
_STYLE_RE = re.compile(
    r"\b(?:please|pls)\s+(?P<value>[^.!?]{3,120})",
    re.IGNORECASE,
)
def _clean_value(value: str) -> str:
return " ".join(str(value or "").strip().split())
def extract_memory_candidates(text: str) -> list[dict[str, Any]]:
    """Scan *text* with the preference/style regexes and return candidates.

    Each candidate dict carries memory_kind, field, text and
    confidence_score. Matches whose cleaned value is shorter than three
    characters are discarded.
    """
    message = str(text or "").strip()
    if not message:
        return []
    specs = (
        (_LIKE_RE, "likes", "fact", 0.68),
        (_DISLIKE_RE, "dislikes", "fact", 0.68),
        (_STYLE_RE, "communication_style", "state", 0.52),
    )
    results: list[dict[str, Any]] = []
    for regex, field, kind, score in specs:
        for hit in regex.finditer(message):
            cleaned = _clean_value(hit.group("value"))
            if len(cleaned) >= 3:
                results.append(
                    {
                        "memory_kind": kind,
                        "field": field,
                        "text": cleaned,
                        "confidence_score": score,
                    }
                )
    return results
def _existing_fingerprints(user_id: int) -> set[tuple[str, str, str, str]]:
    """Build the dedupe fingerprint set for all memory items of *user_id*.

    A fingerprint is (kind, conversation_id, person_id, "field:text"),
    with kind/field/text normalized to lowercase so new candidates can be
    matched case-insensitively.
    """
    rows = MemoryItem.objects.filter(user_id=int(user_id)).only(
        "memory_kind",
        "conversation_id",
        "person_id",
        "content",
    )
    result: set[tuple[str, str, str, str]] = set()
    for row in rows:
        payload = row.content or {}
        field_key = str(payload.get("field") or "").strip().lower()
        text_key = _clean_value(str(payload.get("text") or "")).lower()
        result.add(
            (
                str(row.memory_kind or "").strip().lower(),
                str(row.conversation_id or "").strip(),
                str(row.person_id or "").strip(),
                f"{field_key}:{text_key}",
            )
        )
    return result
def _infer_single_person_id(conversation: WorkspaceConversation) -> str:
    """Return the lone participant id of *conversation*, or "" otherwise.

    Only the first two participants are fetched — that is enough to decide
    whether exactly one participant exists.
    """
    first_two = list(conversation.participants.values_list("id", flat=True)[:2])
    if len(first_two) == 1:
        return str(first_two[0] or "")
    return ""
@transaction.atomic
def suggest_memories_from_recent_messages(
    *,
    user_id: int,
    limit_messages: int = 300,
    max_items: int = 30,
) -> dict[str, int]:
    """Scan recent inbound messages for *user_id* and queue memory suggestions.

    Regex-extracted candidates that are not already present (matched by
    fingerprint) become 'proposed' MemoryItem rows, each with a source
    reference back to the originating message and a pending
    MemoryChangeRequest for review.

    Returns {"scanned": <messages examined>, "queued": <suggestions created>}.
    """
    # Clamp caller-supplied limits to sane bounds.
    safe_limit_messages = max(1, min(2000, int(limit_messages or 300)))
    safe_max_items = max(1, min(500, int(max_items or 30)))
    existing = _existing_fingerprints(int(user_id))
    scanned = 0
    queued = 0
    # Newest inbound messages first; conversation is prefetched for the
    # per-event participant inference below.
    rows = (
        MessageEvent.objects.filter(user_id=int(user_id), direction="in")
        .select_related("conversation")
        .order_by("-ts")[:safe_limit_messages]
    )
    for event in rows:
        scanned += 1
        person_id = _infer_single_person_id(event.conversation)
        for candidate in extract_memory_candidates(event.text or ""):
            field = str(candidate.get("field") or "").strip().lower()
            text = _clean_value(str(candidate.get("text") or ""))
            if not text:
                continue
            # Fingerprint mirrors _existing_fingerprints() so duplicates
            # (including ones queued earlier in this same run) are skipped.
            fingerprint = (
                str(candidate.get("memory_kind") or "fact").strip().lower(),
                str(event.conversation_id or "").strip(),
                person_id,
                f"{field}:{text.lower()}",
            )
            if fingerprint in existing:
                continue
            item = MemoryItem.objects.create(
                user_id=int(user_id),
                conversation=event.conversation,
                person_id=person_id or None,
                memory_kind=str(candidate.get("memory_kind") or "fact"),
                status="proposed",
                content={"field": field, "text": text},
                provenance={
                    "pipeline": "message_regex",
                    "message_event_id": str(event.id),
                },
                confidence_score=float(candidate.get("confidence_score") or 0.5),
            )
            MemorySourceReference.objects.create(
                memory=item,
                message_event=event,
                source_label="message_event",
            )
            MemoryChangeRequest.objects.create(
                user_id=int(user_id),
                memory=item,
                conversation=event.conversation,
                person_id=person_id or None,
                action="create",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                reason="Auto-suggested from recent inbound messages.",
                requested_by_identifier="memory-pipeline",
            )
            existing.add(fingerprint)
            queued += 1
            # Stop as soon as the suggestion budget is spent.
            if queued >= safe_max_items:
                return {"scanned": scanned, "queued": queued}
    return {"scanned": scanned, "queued": queued}
def _coerce_expires_at(value: Any):
    """Parse *value* as an ISO datetime; return an aware datetime or None.

    Blank input yields None. Unparseable input raises ValueError. Naive
    datetimes are interpreted as UTC.
    """
    text = str(value or "").strip()
    if not text:
        return None
    parsed = parse_datetime(text)
    if parsed is None:
        raise ValueError("expires_at must be an ISO datetime")
    if parsed.tzinfo is not None:
        return parsed
    return timezone.make_aware(parsed, dt_timezone.utc)
@transaction.atomic
def create_memory_change_request(
    *,
    user_id: int,
    action: str,
    conversation_id: str = "",
    person_id: str = "",
    memory_id: str = "",
    memory_kind: str = "",
    content: dict[str, Any] | None = None,
    confidence_score: float | None = None,
    expires_at: str = "",
    reason: str = "",
    requested_by_identifier: str = "",
) -> MemoryChangeRequest:
    """Validate inputs and persist a pending MemoryChangeRequest.

    'create' requires a conversation; 'update'/'delete' require an existing
    memory. Unspecified proposed fields fall back to the linked memory's
    current values where one exists.

    Raises ValueError on an unknown action, a missing referenced object, or
    a missing required reference for the action.
    """
    verb = str(action or "").strip().lower()
    if verb not in {"create", "update", "delete"}:
        raise ValueError("action must be create/update/delete")
    memory = None
    if memory_id:
        memory = MemoryItem.objects.filter(user_id=int(user_id), id=memory_id).first()
        if memory is None:
            raise ValueError("memory_id not found")
    conversation = None
    if conversation_id:
        conversation = WorkspaceConversation.objects.filter(
            user_id=int(user_id),
            id=conversation_id,
        ).first()
        if conversation is None:
            raise ValueError("conversation_id not found")
    if verb == "create" and conversation is None:
        raise ValueError("conversation_id is required for create")
    if verb in {"update", "delete"} and memory is None:
        raise ValueError("memory_id is required for update/delete")
    # Proposed confidence: explicit value wins, then the linked memory's.
    if confidence_score is not None:
        proposed_score = float(confidence_score)
    elif memory is not None:
        proposed_score = float(memory.confidence_score)
    else:
        proposed_score = None
    return MemoryChangeRequest.objects.create(
        user_id=int(user_id),
        memory=memory,
        conversation=conversation or (memory.conversation if memory else None),
        person_id=person_id or (str(memory.person_id or "") if memory else "") or None,
        action=verb,
        status="pending",
        proposed_memory_kind=str(memory_kind or (memory.memory_kind if memory else "")).strip(),
        proposed_content=dict(content or {}),
        proposed_confidence_score=proposed_score,
        proposed_expires_at=_coerce_expires_at(expires_at),
        reason=str(reason or "").strip(),
        requested_by_identifier=str(requested_by_identifier or "").strip(),
    )
@transaction.atomic
def review_memory_change_request(
    *,
    user_id: int,
    request_id: str,
    decision: str,
    reviewer_identifier: str = "",
    note: str = "",
) -> MemoryChangeRequest:
    """Approve or reject a pending MemoryChangeRequest and apply its effect.

    On approval the request's action is applied to its memory:
    create -> a new active MemoryItem (or re-activation of the linked one),
    update -> proposed fields overwrite the memory, delete -> the memory is
    marked 'deprecated'. The request itself ends as 'rejected' or 'applied'.

    Raises ValueError for a non-pending request, an unknown decision, or an
    approved request missing its required memory/conversation.
    """
    req = MemoryChangeRequest.objects.select_related("memory", "conversation").get(
        id=request_id,
        user_id=int(user_id),
    )
    if req.status != "pending":
        raise ValueError("request is not pending")
    now = timezone.now()
    normalized_decision = str(decision or "").strip().lower()
    if normalized_decision not in {"approve", "reject"}:
        raise ValueError("decision must be approve/reject")
    req.reviewed_by_identifier = str(reviewer_identifier or "").strip()
    req.reviewed_at = now
    if note:
        # Append the review note onto the existing reason text.
        req.reason = f"{req.reason}\n\nReview note: {str(note).strip()}".strip()
    if normalized_decision == "reject":
        req.status = "rejected"
        req.save(
            update_fields=[
                "status",
                "reviewed_by_identifier",
                "reviewed_at",
                "reason",
                "updated_at",
            ]
        )
        return req
    # Mark approved first; the apply step below transitions to 'applied'.
    req.status = "approved"
    req.save(
        update_fields=[
            "status",
            "reviewed_by_identifier",
            "reviewed_at",
            "reason",
            "updated_at",
        ]
    )
    memory = req.memory
    if req.action == "create":
        if memory is None:
            if req.conversation is None:
                raise ValueError("approved create request missing conversation")
            memory = MemoryItem.objects.create(
                user_id=int(user_id),
                conversation=req.conversation,
                person_id=req.person_id,
                memory_kind=req.proposed_memory_kind or "fact",
                status="active",
                content=req.proposed_content or {},
                confidence_score=float(req.proposed_confidence_score or 0.5),
                expires_at=req.proposed_expires_at,
                last_verified_at=now,
                provenance={"approved_request_id": str(req.id)},
            )
            req.memory = memory
        else:
            # Create request already linked to a memory: just re-activate it.
            memory.status = "active"
            memory.last_verified_at = now
            memory.save(update_fields=["status", "last_verified_at", "updated_at"])
    elif req.action == "update":
        if memory is None:
            raise ValueError("approved update request missing memory")
        # Only overwrite fields the request actually proposed.
        if req.proposed_memory_kind:
            memory.memory_kind = req.proposed_memory_kind
        if req.proposed_content:
            memory.content = req.proposed_content
        if req.proposed_confidence_score is not None:
            memory.confidence_score = float(req.proposed_confidence_score)
        memory.expires_at = req.proposed_expires_at
        memory.last_verified_at = now
        memory.status = "active"
        memory.save()
    else:
        # Remaining action is "delete": soft-delete via status change.
        if memory is None:
            raise ValueError("approved delete request missing memory")
        memory.status = "deprecated"
        memory.last_verified_at = now
        memory.save(update_fields=["status", "last_verified_at", "updated_at"])
    req.status = "applied"
    req.save(update_fields=["status", "memory", "updated_at"])
    return req
@transaction.atomic
def run_memory_hygiene(*, user_id: int | None = None, dry_run: bool = False) -> dict[str, int]:
    """Expire stale memories and queue review requests for contradictions.

    Two passes over active MemoryItem rows (optionally scoped to *user_id*):

    1. Items whose expires_at has passed are marked 'deprecated'.
    2. Items sharing the same (user, person, conversation, kind, field)
       scope but holding different normalized text values are treated as
       contradictions; each gets a pending 'update' MemoryChangeRequest
       unless one already exists for it.

    With dry_run=True nothing is written; counters are still computed.

    Returns {"expired": ..., "contradictions": ..., "queued_requests": ...}.
    """
    now = timezone.now()
    queryset = MemoryItem.objects.filter(status="active")
    if user_id is not None:
        queryset = queryset.filter(user_id=int(user_id))
    expired_ids = list(
        queryset.filter(expires_at__isnull=False, expires_at__lte=now).values_list(
            "id",
            flat=True,
        )
    )
    expired = len(expired_ids)
    if expired and not dry_run:
        MemoryItem.objects.filter(id__in=expired_ids).update(status="deprecated")
    contradictions = 0
    queued = 0
    # scope -> normalized text -> items carrying that text.
    # NOTE(review): when not dry_run, the status update above runs before
    # this queryset is evaluated, so freshly-expired items are excluded from
    # the contradiction scan; under dry_run they are included — confirm
    # that asymmetry is intended.
    grouped: dict[tuple[int, str, str, str, str], dict[str, list[MemoryItem]]] = {}
    for item in queryset.select_related("conversation", "person"):
        content = item.content or {}
        field = str(content.get("field") or content.get("key") or "").strip().lower()
        text = _clean_value(str(content.get("text") or content.get("value") or "")).lower()
        if not field or not text:
            continue
        scope = (
            int(item.user_id),
            str(item.person_id or ""),
            str(item.conversation_id or ""),
            str(item.memory_kind or ""),
            field,
        )
        grouped.setdefault(scope, {}).setdefault(text, []).append(item)
    for values in grouped.values():
        # A single distinct text value in a scope means no contradiction.
        if len(values) <= 1:
            continue
        flat = [item for subset in values.values() for item in subset]
        contradictions += len(flat)
        if dry_run:
            continue
        for item in flat:
            # Skip items that already have a pending contradiction request.
            already_pending = MemoryChangeRequest.objects.filter(
                user_id=item.user_id,
                memory=item,
                action="update",
                status="pending",
                reason__icontains="contradiction",
            ).exists()
            if already_pending:
                continue
            MemoryChangeRequest.objects.create(
                user_id=item.user_id,
                memory=item,
                conversation=item.conversation,
                person=item.person,
                action="update",
                status="pending",
                proposed_memory_kind=item.memory_kind,
                proposed_content=item.content,
                proposed_confidence_score=item.confidence_score,
                proposed_expires_at=item.expires_at,
                reason="Contradiction detected by hygiene job.",
                requested_by_identifier="memory-hygiene",
            )
            queued += 1
    log.info(
        "memory hygiene user=%s dry_run=%s expired=%s contradictions=%s queued=%s",
        user_id if user_id is not None else "-",
        dry_run,
        expired,
        contradictions,
        queued,
    )
    return {
        "expired": expired,
        "contradictions": contradictions,
        "queued_requests": queued,
    }

123
core/memory/retrieval.py Normal file
View File

@@ -0,0 +1,123 @@
from __future__ import annotations
from typing import Any
from django.db.models import Q
from django.utils import timezone
from core.memory.search_backend import get_memory_search_backend
from core.models import MemoryItem
def _coerce_statuses(value: Any, default: tuple[str, ...]) -> tuple[str, ...]:
if isinstance(value, (list, tuple, set)):
items = [str(item or "").strip().lower() for item in value]
else:
items = [item.strip().lower() for item in str(value or "").split(",")]
cleaned = tuple(item for item in items if item)
return cleaned or default
def _base_queryset(
    *,
    user_id: int,
    person_id: str = "",
    conversation_id: str = "",
    statuses: tuple[str, ...] = ("active",),
):
    """Return the MemoryItem queryset for *user_id*, scoped and unexpired.

    Filters by status list (when given), excludes rows whose expires_at has
    already passed, and optionally narrows to a single person and/or
    conversation.
    """
    current_time = timezone.now()
    rows = MemoryItem.objects.filter(user_id=int(user_id))
    if statuses:
        rows = rows.filter(status__in=list(statuses))
    rows = rows.filter(Q(expires_at__isnull=True) | Q(expires_at__gt=current_time))
    if person_id:
        rows = rows.filter(person_id=person_id)
    if conversation_id:
        rows = rows.filter(conversation_id=conversation_id)
    return rows
def _memory_row(item: MemoryItem, *, score: float = 0.0, summary: str = "") -> dict[str, Any]:
    """Serialize one MemoryItem into the prompt-facing dict shape."""
    return {
        "id": str(item.id),
        "memory_kind": str(item.memory_kind or ""),
        "status": str(item.status or ""),
        "person_id": str(item.person_id or ""),
        "conversation_id": str(item.conversation_id or ""),
        "content": item.content or {},
        "provenance": item.provenance or {},
        "confidence_score": float(item.confidence_score or 0.0),
        "expires_at": item.expires_at.isoformat() if item.expires_at else "",
        "last_verified_at": (
            item.last_verified_at.isoformat() if item.last_verified_at else ""
        ),
        "updated_at": item.updated_at.isoformat() if item.updated_at else "",
        "search_score": score,
        "search_summary": summary,
    }


def retrieve_memories_for_prompt(
    *,
    user_id: int,
    query: str = "",
    person_id: str = "",
    conversation_id: str = "",
    statuses: tuple[str, ...] = ("active",),
    limit: int = 20,
) -> list[dict[str, Any]]:
    """Fetch memories for prompt assembly, with optional full-text search.

    With a non-blank *query*, the search backend ranks matches and results
    keep the backend's score/summary, re-checked against the scoped
    queryset so expired or out-of-scope items never leak through. Without
    a query, the most recently verified/updated items are returned with a
    zero search score.

    Returns at most *limit* (clamped to 1..200) serialized rows.
    """
    statuses = _coerce_statuses(statuses, ("active",))
    safe_limit = max(1, min(200, int(limit or 20)))
    search_text = str(query or "").strip()
    if not search_text:
        # Recency fallback: no search backend involved.
        recent = _base_queryset(
            user_id=int(user_id),
            person_id=person_id,
            conversation_id=conversation_id,
            statuses=statuses,
        ).order_by("-last_verified_at", "-updated_at")
        return [_memory_row(item) for item in recent[:safe_limit]]
    backend = get_memory_search_backend()
    hits = backend.search(
        user_id=int(user_id),
        query=search_text,
        conversation_id=conversation_id,
        limit=safe_limit,
        include_statuses=statuses,
    )
    ids = [str(hit.memory_id or "").strip() for hit in hits if str(hit.memory_id or "").strip()]
    # Re-scope the hit ids through the base queryset so person/expiry/status
    # constraints are enforced even if the search index is stale.
    scoped = _base_queryset(
        user_id=int(user_id),
        person_id=person_id,
        conversation_id=conversation_id,
        statuses=statuses,
    ).filter(id__in=ids)
    by_id = {str(item.id): item for item in scoped}
    rows: list[dict[str, Any]] = []
    # Preserve the backend's ranking order.
    for hit in hits:
        item = by_id.get(str(hit.memory_id))
        if not item:
            continue
        rows.append(
            _memory_row(
                item,
                score=float(hit.score or 0.0),
                summary=str(hit.summary or ""),
            )
        )
    return rows

View File

@@ -137,6 +137,8 @@ class DjangoMemorySearchBackend(BaseMemorySearchBackend):
class ManticoreMemorySearchBackend(BaseMemorySearchBackend):
name = "manticore"
_table_ready_cache: dict[str, float] = {}
_table_ready_ttl_seconds = 30.0
def __init__(self):
self.base_url = str(
@@ -146,6 +148,7 @@ class ManticoreMemorySearchBackend(BaseMemorySearchBackend):
getattr(settings, "MANTICORE_MEMORY_TABLE", "gia_memory_items")
).strip() or "gia_memory_items"
self.timeout_seconds = int(getattr(settings, "MANTICORE_HTTP_TIMEOUT", 5) or 5)
self._table_cache_key = f"{self.base_url}|{self.table}"
def _sql(self, query: str) -> dict[str, Any]:
response = requests.post(
@@ -160,6 +163,9 @@ class ManticoreMemorySearchBackend(BaseMemorySearchBackend):
return dict(payload or {})
def ensure_table(self) -> None:
last_ready = float(self._table_ready_cache.get(self._table_cache_key, 0.0) or 0.0)
if (time.time() - last_ready) <= float(self._table_ready_ttl_seconds):
return
self._sql(
(
f"CREATE TABLE IF NOT EXISTS {self.table} ("
@@ -175,6 +181,7 @@ class ManticoreMemorySearchBackend(BaseMemorySearchBackend):
")"
)
)
self._table_ready_cache[self._table_cache_key] = time.time()
def _doc_id(self, memory_id: str) -> int:
digest = hashlib.blake2b(
@@ -206,11 +213,66 @@ class ManticoreMemorySearchBackend(BaseMemorySearchBackend):
)
self._sql(query)
def _build_upsert_values_clause(self, item: MemoryItem) -> str:
memory_id = str(item.id)
doc_id = self._doc_id(memory_id)
summary = _flatten_to_text(item.content)[:280]
body = _flatten_to_text(item.content)
updated_ts = int(item.updated_at.timestamp() * 1000)
return (
f"({doc_id},'{self._escape(memory_id)}',{int(item.user_id)},"
f"'{self._escape(item.conversation_id)}','{self._escape(item.memory_kind)}',"
f"'{self._escape(item.status)}',{updated_ts},"
f"'{self._escape(summary)}','{self._escape(body)}')"
)
def delete(self, memory_id: str) -> None:
self.ensure_table()
doc_id = self._doc_id(memory_id)
self._sql(f"DELETE FROM {self.table} WHERE id={doc_id}")
def reindex(
self,
*,
user_id: int | None = None,
include_statuses: tuple[str, ...] = ("active",),
limit: int = 2000,
) -> dict[str, int]:
self.ensure_table()
queryset = MemoryItem.objects.all().order_by("-updated_at")
if user_id is not None:
queryset = queryset.filter(user_id=int(user_id))
if include_statuses:
queryset = queryset.filter(status__in=list(include_statuses))
scanned = 0
indexed = 0
batch_size = 100
values: list[str] = []
for item in queryset[: max(1, int(limit))]:
scanned += 1
try:
values.append(self._build_upsert_values_clause(item))
except Exception as exc:
log.warning("memory-search upsert build failed id=%s err=%s", item.id, exc)
continue
if len(values) >= batch_size:
self._sql(
f"REPLACE INTO {self.table} "
"(id,memory_uuid,user_id,conversation_id,memory_kind,status,updated_ts,summary,body) "
f"VALUES {','.join(values)}"
)
indexed += len(values)
values = []
if values:
self._sql(
f"REPLACE INTO {self.table} "
"(id,memory_uuid,user_id,conversation_id,memory_kind,status,updated_ts,summary,body) "
f"VALUES {','.join(values)}"
)
indexed += len(values)
return {"scanned": scanned, "indexed": indexed}
def search(
self,
*,