Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025).

Architecture:
- Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle)
- Multi-hop: Hebbian W matrix with WTA pattern separation
- 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency
- 4 ms latency @ 20K memories, ~1 GB VRAM

Key findings:
- Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian)
- WTA pattern separation enables 20K+ capacity
- Multi-hop associative chains (6 hops, CosSim = 1.0) — RAG can't do this
- MiniLM-L6 is optimal (discrimination gap > absolute similarity)
- Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark
- SNN encoder viable (CosSim 0.99) but not needed for current architecture

259 lines · 9.4 KiB · Python
"""Experiment P4: Memory Lifecycle Management.
|
||
|
||
Questions:
|
||
1. What's worth storing? (not everything in a conversation is a "memory")
|
||
2. When to forget? (access-based decay, age-based decay, capacity pressure)
|
||
3. Can we merge similar memories? (deduplification / compression)
|
||
4. Importance scoring: how to prioritize during recall and forgetting?
|
||
|
||
Strategy: implement and test each mechanism, measure impact on recall quality.
|
||
"""
|
||
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
from collections import Counter
|
||
|
||
import torch
|
||
import torch.nn as nn
|
||
import numpy as np
|
||
|
||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||
from nuonuo.hippocampus import HippocampalMemory
|
||
|
||
DEVICE = "cuda"
|
||
|
||
|
||
def cosine(a, b):
    """Cosine similarity between two 1-D tensors, as a Python float."""
    sim = nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0))
    return sim.item()
def load_model():
    """Construct the MiniLM sentence encoder on DEVICE (lazy import so the
    module can be inspected without sentence-transformers installed)."""
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
    return model
def emb(model, text):
    """Encode a single string into an L2-normalized embedding tensor on DEVICE."""
    batch = model.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
    )
    return batch[0]
def test_deduplication(model):
    """Detect near-duplicate memories by cue similarity and merge each group,
    keeping the entry with the most informative (longest) target."""
    print("=== Test 1: Deduplication ===\n")

    mem = HippocampalMemory(embed_dim=384)

    # Cue/target pairs; several are deliberate near-duplicates of each other.
    memories = [
        ("The database is slow", "Check missing indexes"),
        ("Database is really slow today", "Check missing indexes on users table"),  # near-dup
        ("DB performance is terrible", "Look at index usage"),  # near-dup
        ("Deploy to production", "Use blue-green deployment"),
        ("Push to prod", "Blue-green deployment via GitHub Actions"),  # near-dup
        ("The API returns 500 errors", "Check for OOM in Python worker"),
        ("Getting 500 errors from API", "Python worker might be OOM"),  # near-dup
        ("Set up monitoring", "Prometheus + Grafana"),
        ("We need better observability", "Set up Prometheus and Grafana"),  # near-dup
    ]

    for cue, target in memories:
        mem.store(emb(model, cue), emb(model, target),
                  metadata={"cue": cue, "target": target})

    print(f" Before dedup: {len(mem.memories)} memories")

    # Greedy single-pass grouping: each not-yet-grouped entry seeds a group
    # and absorbs every later entry whose cue similarity exceeds 0.7.
    entries = list(mem.memories.values())
    groups = []
    used = set()
    for idx, anchor in enumerate(entries):
        if idx in used:
            continue
        group = [idx]
        for later in range(idx + 1, len(entries)):
            if later in used:
                continue
            if cosine(anchor.cue_embedding, entries[later].cue_embedding) > 0.7:
                group.append(later)
                used.add(later)
        groups.append(group)
        used.add(idx)

    print(f" Found {len(groups)} groups (from {len(entries)} memories):")
    for group in groups:
        if len(group) > 1:
            cues = [entries[k].metadata.get("cue", "?") for k in group]
            print(f" Group ({len(group)}): {[c[:30] for c in cues]}")

    # Merge policy: within each multi-member group, keep the entry whose
    # target text is longest (assumed most informative); forget the rest.
    to_remove = []
    for group in groups:
        if len(group) > 1:
            best = max(group, key=lambda k: len(entries[k].metadata.get("target", "")))
            to_remove.extend(entries[k].memory_id for k in group if k != best)

    for mid in to_remove:
        mem.forget(mid)

    print(f" After dedup: {len(mem.memories)} memories")
    print(f" Removed {len(to_remove)} duplicates")
def test_importance_scoring(model):
    """Score simulated conversation turns with a keyword heuristic and check
    whether each score falls into its expected importance band."""
    print("\n=== Test 2: Importance Scoring ===\n")

    # (user, assistant, expected_importance)
    conversations = [
        ("Hi there!", "Hello! How can I help?", "low"),
        ("What's the weather?", "It's sunny today.", "low"),
        ("The production database crashed at 3am", "Emergency: restore from latest backup at s3://backups/db-latest.sql", "high"),
        ("What time is it?", "It's 3:45 PM.", "low"),
        ("The auth service JWT secret was compromised", "Rotate secret immediately: kubectl set env deployment/auth JWT_SECRET=new_value", "critical"),
        ("Deploy the hotfix", "Deployed via GitHub Actions, monitor Grafana for 30 min", "high"),
        ("Thanks for your help", "You're welcome!", "low"),
    ]

    def score_importance(user_msg, assistant_msg):
        """Heuristic importance in [0.3, 1.0]: length + keyword hits + question."""
        combined = (user_msg + assistant_msg).lower()  # hoisted: scanned per keyword
        critical_words = ["crash", "emergency", "compromised", "secret", "password",
                          "production", "outage", "down", "data loss"]
        high_words = ["deploy", "config", "fix", "bug", "error", "migrate",
                      "backup", "restore", "rollback"]

        score = 0.3  # base
        # Longer answers suggest complexity worth remembering.
        if len(assistant_msg.split()) > 15:
            score += 0.2
        for w in critical_words:
            if w in combined:
                score += 0.3
        for w in high_words:
            if w in combined:
                score += 0.1
        # Questions suggest retrievable info.
        if "?" in user_msg:
            score += 0.1
        return min(score, 1.0)

    for user, assistant, expected in conversations:
        score = score_importance(user, assistant)
        in_band = (
            (expected == "low" and score < 0.5)
            or (expected == "high" and 0.5 <= score < 0.8)
            or (expected == "critical" and score >= 0.8)
        )
        status = "✓" if in_band else "✗"
        should_store = score >= 0.4  # storage cutoff
        print(f" {status} [{score:.2f}] {'STORE' if should_store else 'SKIP ':>5} "
              f"({expected:>8}) '{user[:40]}...'")
def test_forgetting_strategies(model):
    """Compare FIFO, least-accessed, and importance-based forgetting under a
    hard capacity cap, reporting per-day recall of surviving memories."""
    print("\n=== Test 3: Forgetting Strategies ===\n")

    # 7 days × 10 memories/day = 70 stored, capped at 30 → forced forgetting.
    days = 7
    per_day = 10
    max_capacity = 30

    cue_template = "Day {day} task {i}: {topic}"
    target_template = "Solution for day {day} task {i}"
    topics = ["database", "deploy", "monitoring", "auth", "API",
              "caching", "logging", "testing", "docker", "CI/CD"]

    def run_strategy(strategy_name, forget_fn):
        """Fill memory day by day, evicting via forget_fn when over capacity,
        then measure top-1 recall of whatever survived."""
        mem = HippocampalMemory(embed_dim=384)
        day_memories = {}  # day → list of memory_ids, in store order

        for day in range(1, days + 1):
            stored_ids = []
            for task in range(per_day):
                cue = cue_template.format(day=day, i=task, topic=topics[task])
                target = target_template.format(day=day, i=task)
                mid = mem.store(emb(model, cue), emb(model, target),
                                metadata={"day": day, "task": task},
                                timestamp=float(day))
                stored_ids.append(mid)
            day_memories[day] = stored_ids

            # Evict once the day's additions push us past the cap.
            if len(mem.memories) > max_capacity:
                forget_fn(mem, max_capacity)

        # Recall each surviving memory by its original cue; count top-1 hits.
        day_recall = {}
        for day in range(1, days + 1):
            correct = 0
            total = 0
            for task in range(per_day):
                ids = day_memories[day]
                mid = ids[task] if task < len(ids) else None
                if mid is None or mid not in mem.memories:
                    continue  # forgotten — excluded from the denominator
                cue = cue_template.format(day=day, i=task, topic=topics[task])
                results = mem.recall(emb(model, cue), top_k=1)
                if results and results[0].memory_id == mid:
                    correct += 1
                total += 1
            day_recall[day] = (correct, total)

        print(f" {strategy_name}: {len(mem.memories)} memories surviving")
        for day in range(1, days + 1):
            c, t = day_recall[day]
            pct = f"{c}/{t}" if t > 0 else "0/0"
            print(f" Day {day}: {pct}")

    # Strategy 1: FIFO — evict the oldest timestamps first.
    def forget_fifo(mem, cap):
        by_age = sorted(mem.memories.values(), key=lambda e: e.timestamp)
        excess = len(mem.memories) - cap
        for entry in by_age[:excess]:
            mem.forget(entry.memory_id)

    # Strategy 2: evict the least-accessed entries first (frequency-based;
    # labelled "LRU" in the report strings).
    def forget_lru(mem, cap):
        by_access = sorted(mem.memories.values(), key=lambda e: e.access_count)
        excess = len(mem.memories) - cap
        for entry in by_access[:excess]:
            mem.forget(entry.memory_id)

    # Strategy 3: evict lowest combined score (recency + 0.5 × access count).
    def forget_low_importance(mem, cap):
        by_score = sorted(mem.memories.values(),
                          key=lambda e: e.timestamp + e.access_count * 0.5)
        excess = len(mem.memories) - cap
        for entry in by_score[:excess]:
            mem.forget(entry.memory_id)

    print("(max_capacity=30, 7 days × 10 memories = 70 total)")
    run_strategy("FIFO (oldest first)", forget_fifo)
    print()
    run_strategy("LRU (least accessed)", forget_lru)
    print()
    run_strategy("Importance (recency+access)", forget_low_importance)
def main():
    """Run all three lifecycle experiments with one shared embedding model."""
    banner = "=" * 60
    print(banner)
    print("Experiment P4: Memory Lifecycle")
    print(banner)

    model = load_model()
    test_deduplication(model)
    test_importance_scoring(model)
    test_forgetting_strategies(model)


if __name__ == "__main__":
    main()