add nocmem: auto memory recall + ingest via NuoNuo hippocampal network

- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt);
  see the sketch after this list
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
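A minimal sketch of the recall path described above, for orientation: a cosine pre-filter first, then Hopfield-style softmax attention over the survivors. The names here (recall_sketch, cues) are illustrative, not nocmem's actual API; only the constants (threshold 0.35, β=32, top_k=3) come from this commit.

import torch

def recall_sketch(query, cues, beta=32.0, threshold=0.35, top_k=3):
    """query: (d,) unit vector; cues: (N, d) unit memory cue vectors."""
    cos = cues @ query                            # cosine similarity to every memory
    idx = (cos >= threshold).nonzero(as_tuple=True)[0]  # cosine pre-filter
    if idx.numel() == 0:
        return []                                 # nothing recalled clearly
    attn = torch.softmax(beta * cos[idx], dim=0)  # Hopfield attention over survivors
    order = attn.argsort(descending=True)[:top_k]
    return [(idx[i].item(), attn[i].item()) for i in order]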
Fam Zheng
2026-04-11 12:24:48 +01:00
parent 688387dac3
commit 7000ccda0f
17 changed files with 4164 additions and 3 deletions

View File

@@ -0,0 +1,345 @@
"""Efficiency benchmark for nocmem vs ChromaDB baseline.
Measures: storage size, memory usage, query latency, ingest throughput
at various scales (100, 1K, 5K, 10K, 20K memories).
Usage:
uv run python benchmarks/efficiency_bench.py
"""
import gc
import os
import json
import shutil
import tempfile
import time
import torch
import psutil
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "all-MiniLM-L6-v2"
EMBED_DIM = 384
DATA_FILE = "benchmarks/longmemeval.json"
# ── helpers ─────────────────────────────────────────────────────────
def get_process_mem_mb():
return psutil.Process(os.getpid()).memory_info().rss / 1024**2
def get_gpu_mem_mb():
if DEVICE != "cuda":
return 0.0
return torch.cuda.memory_allocated() / 1024**2
def file_size_mb(path):
if os.path.exists(path):
return os.path.getsize(path) / 1024**2
return 0.0
def dir_size_mb(path):
total = 0
for dirpath, _, filenames in os.walk(path):
for f in filenames:
total += os.path.getsize(os.path.join(dirpath, f))
return total / 1024**2
# ── extract chunks from LongMemEval ────────────────────────────────
def load_chunks(max_chunks=25000):
"""Extract turn-level chunks from LongMemEval data."""
with open(DATA_FILE) as f:
data = json.load(f)
chunks = []
seen = set()
for item in data:
for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
for i in range(0, len(sess) - 1, 2):
key = (sid, i)
if key in seen:
continue
seen.add(key)
user = sess[i]["content"]
asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
text = f"{user}\n{asst}"[:1000]
chunks.append(text)
if len(chunks) >= max_chunks:
return chunks
return chunks
# ── nocmem benchmark ────────────────────────────────────────────────
def bench_nocmem(encoder, chunks, n, query_texts):
"""Benchmark nocmem at scale n."""
torch.cuda.empty_cache()
gc.collect()
subset = chunks[:n]
gpu_before = get_gpu_mem_mb()
ram_before = get_process_mem_mb()
# batch embed
t0 = time.monotonic()
embeddings = encoder.encode(
subset, convert_to_tensor=True, normalize_embeddings=True,
device=DEVICE, batch_size=256, show_progress_bar=False,
)
embed_time = time.monotonic() - t0
# store
hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
t1 = time.monotonic()
for i in range(n):
hip.store(embeddings[i], embeddings[i], metadata={"id": i})
store_time = time.monotonic() - t1
gpu_after = get_gpu_mem_mb()
ram_after = get_process_mem_mb()
    # save to measure file size (mkstemp avoids the deprecated, race-prone mktemp)
    fd, tmp = tempfile.mkstemp(suffix=".pt")
    os.close(fd)
    hip.save(tmp)
    disk_mb = file_size_mb(tmp)
    os.unlink(tmp)
# query latency — multiple queries, measure p50/p99
query_embs = encoder.encode(
query_texts, convert_to_tensor=True, normalize_embeddings=True,
device=DEVICE, show_progress_bar=False,
)
latencies = []
for qe in query_embs:
t = time.monotonic()
hip.recall(qe, top_k=5)
latencies.append((time.monotonic() - t) * 1000)
latencies.sort()
p50 = latencies[len(latencies) // 2]
p99 = latencies[int(len(latencies) * 0.99)]
avg = sum(latencies) / len(latencies)
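    # note: with only ~20 queries, int(len(latencies) * 0.99) indexes the last
    # (slowest) sample, so "p99" here is effectively the max; it only becomes a
    # meaningful percentile estimate with a larger query set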
# cleanup
del hip, embeddings
torch.cuda.empty_cache()
return {
"n": n,
"embed_time_s": embed_time,
"store_time_s": store_time,
"ingest_rate": n / (embed_time + store_time), # memories/sec
"disk_mb": disk_mb,
"gpu_delta_mb": gpu_after - gpu_before,
"ram_delta_mb": ram_after - ram_before,
"latency_avg_ms": avg,
"latency_p50_ms": p50,
"latency_p99_ms": p99,
}
# ── chromadb benchmark ──────────────────────────────────────────────
def bench_chromadb(encoder, chunks, n, query_texts):
"""Benchmark ChromaDB (MemPalace's backend) at scale n."""
import chromadb
subset = chunks[:n]
ram_before = get_process_mem_mb()
tmpdir = tempfile.mkdtemp()
client = chromadb.PersistentClient(path=tmpdir)
collection = client.create_collection(
name="bench",
metadata={"hnsw:space": "cosine"},
)
# embed
t0 = time.monotonic()
embeddings_np = encoder.encode(
subset, normalize_embeddings=True,
batch_size=256, show_progress_bar=False,
)
embed_time = time.monotonic() - t0
# store — chromadb takes numpy/list
t1 = time.monotonic()
batch = 5000
for start in range(0, n, batch):
end = min(start + batch, n)
collection.add(
ids=[str(i) for i in range(start, end)],
embeddings=embeddings_np[start:end].tolist(),
documents=subset[start:end],
)
store_time = time.monotonic() - t1
ram_after = get_process_mem_mb()
disk_mb = dir_size_mb(tmpdir)
# query latency
query_np = encoder.encode(
query_texts, normalize_embeddings=True, show_progress_bar=False,
)
latencies = []
for qe in query_np:
t = time.monotonic()
collection.query(query_embeddings=[qe.tolist()], n_results=5)
latencies.append((time.monotonic() - t) * 1000)
latencies.sort()
p50 = latencies[len(latencies) // 2]
p99 = latencies[int(len(latencies) * 0.99)]
avg = sum(latencies) / len(latencies)
# cleanup
del client, collection
shutil.rmtree(tmpdir)
return {
"n": n,
"embed_time_s": embed_time,
"store_time_s": store_time,
"ingest_rate": n / (embed_time + store_time),
"disk_mb": disk_mb,
"gpu_delta_mb": 0,
"ram_delta_mb": ram_after - ram_before,
"latency_avg_ms": avg,
"latency_p50_ms": p50,
"latency_p99_ms": p99,
}
# ── main ────────────────────────────────────────────────────────────
def main():
print("nocmem efficiency benchmark")
print(f"device: {DEVICE}")
print()
# check chromadb available
has_chromadb = False
try:
import chromadb
has_chromadb = True
print("chromadb: available (will compare)")
except ImportError:
print("chromadb: not installed (nocmem only)")
print()
print("loading data...")
chunks = load_chunks(25000)
print(f" {len(chunks)} unique chunks extracted")
print("loading encoder...")
encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
# query texts — mix of English and Chinese
query_texts = [
"What degree did I graduate with?",
"How to deploy the application?",
"What was the database error we fixed last week?",
"Tell me about the meeting schedule",
"What programming language should I learn?",
"数据库密码在哪里",
"部署到生产环境的步骤",
"上次讨论的性能优化方案",
"项目的技术栈是什么",
"最近的待办事项有哪些",
"How do I configure the server?",
"What's the API endpoint for user authentication?",
"Can you recommend some books on machine learning?",
"What was the root cause of the production incident?",
"How much memory does the GPU have?",
"VR设备的兼容性问题",
"模型推理的延迟是多少",
"代码仓库的结构是怎样的",
"如何解决内存泄漏",
"上次会议的结论是什么",
]
scales = [100, 500, 1000, 5000, 10000, 20000]
# filter to what we have
scales = [s for s in scales if s <= len(chunks)]
nocmem_results = []
chroma_results = []
for n in scales:
print(f"\n── scale: {n:,} memories ──")
print(f" nocmem...", end="", flush=True)
r = bench_nocmem(encoder, chunks, n, query_texts)
nocmem_results.append(r)
print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)")
if has_chromadb:
print(f" chromadb...", end="", flush=True)
r2 = bench_chromadb(encoder, chunks, n, query_texts)
chroma_results.append(r2)
print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)")
# ── report ──────────────────────────────────────────────────────
print(f"\n{'='*80}")
print(f"EFFICIENCY BENCHMARK RESULTS")
print(f"{'='*80}")
# table header
if has_chromadb:
print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}")
print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}")
print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
print("-" * 100)
for nm, cr in zip(nocmem_results, chroma_results):
print(
f"{nm['n']:>8,} | "
f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | "
f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}"
)
else:
print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}")
print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
print("-" * 60)
for nm in nocmem_results:
print(
f"{nm['n']:>8,} | "
f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}"
)
# summary
if nocmem_results:
biggest = nocmem_results[-1]
print(f"\nnocmem @ {biggest['n']:,}:")
print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
print(f" Disk: {biggest['disk_mb']:.1f} MB")
print(f" VRAM delta: {biggest['gpu_delta_mb']:.1f} MB")
print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")
if chroma_results:
biggest = chroma_results[-1]
print(f"\nChromaDB @ {biggest['n']:,}:")
print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
print(f" Disk: {biggest['disk_mb']:.1f} MB")
print(f" RAM delta: {biggest['ram_delta_mb']:.1f} MB")
print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")
if has_chromadb and nocmem_results and chroma_results:
nm = nocmem_results[-1]
cr = chroma_results[-1]
print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──")
lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf')
disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf')
rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf')
print(f" Latency: nocmem {lat_ratio:.1f}x faster" if lat_ratio > 1 else f" Latency: ChromaDB {1/lat_ratio:.1f}x faster")
print(f" Disk: nocmem {disk_ratio:.1f}x smaller" if disk_ratio > 1 else f" Disk: ChromaDB {1/disk_ratio:.1f}x smaller")
print(f" Ingest: nocmem {rate_ratio:.1f}x faster" if rate_ratio > 1 else f" Ingest: ChromaDB {1/rate_ratio:.1f}x faster")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,239 @@
"""LongMemEval benchmark for nocmem.
Evaluates retrieval quality: given a question, can nocmem find the correct
session(s) from a haystack of ~50 conversation sessions?
Uses HippocampalMemory directly (no HTTP) for speed.
Compares against MemPalace's 96.6% R@5 baseline.
Usage:
uv run python benchmarks/longmemeval_bench.py [--limit N] [--granularity session|turn]
"""
import argparse
import json
import math
import sys
import time
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
# ── setup ───────────────────────────────────────────────────────────
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "all-MiniLM-L6-v2"
EMBED_DIM = 384
def load_encoder():
print(f"loading {EMBED_MODEL} on {DEVICE}...")
return SentenceTransformer(EMBED_MODEL, device=DEVICE)
def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
"""Batch embed, returns (N, dim) tensor."""
return encoder.encode(
texts, convert_to_tensor=True, normalize_embeddings=True,
device=DEVICE, batch_size=128, show_progress_bar=False,
)
# ── granularity: how to chunk sessions ──────────────────────────────
def sessions_to_chunks_turn(session_ids, sessions):
"""Each user-assistant turn becomes a separate chunk."""
chunks = [] # (text, session_id)
for sid, sess in zip(session_ids, sessions):
for i in range(0, len(sess) - 1, 2):
user = sess[i]["content"]
asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
text = f"{user}\n{asst}"
# truncate long turns to avoid embedding issues
chunks.append((text[:1000], sid))
# handle odd-numbered turns
if len(sess) % 2 == 1:
chunks.append((sess[-1]["content"][:1000], sid))
return chunks
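# e.g. a 5-message session [u0, a0, u1, a1, u2] becomes the chunks
# "u0\na0" and "u1\na1", plus the trailing "u2" via the odd-length branch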
def sessions_to_chunks_session(session_ids, sessions):
"""Each session becomes a single chunk (concatenated turns)."""
chunks = []
for sid, sess in zip(session_ids, sessions):
text = "\n".join(m["content"] for m in sess)
# truncate to fit embedding model's context
chunks.append((text[:2000], sid))
return chunks
# ── evaluate one question ───────────────────────────────────────────
def evaluate_question(encoder, item, granularity, ks=(5, 10)):
"""Store haystack, query, check if answer session in top-K.
Returns dict with R@5, R@10, NDCG@10, timings.
"""
# chunk the haystack
if granularity == "turn":
chunks = sessions_to_chunks_turn(
item["haystack_session_ids"], item["haystack_sessions"])
else:
chunks = sessions_to_chunks_session(
item["haystack_session_ids"], item["haystack_sessions"])
texts = [c[0] for c in chunks]
sids = [c[1] for c in chunks]
answer_sids = set(item["answer_session_ids"])
# batch embed all chunks
t0 = time.monotonic()
embeddings = embed_batch(encoder, texts)
embed_time = time.monotonic() - t0
# build memory
t1 = time.monotonic()
hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
for i in range(len(chunks)):
hip.store(
embeddings[i], embeddings[i],
metadata={"session_id": sids[i]},
)
store_time = time.monotonic() - t1
# query
t2 = time.monotonic()
query_emb = encoder.encode(
[item["question"]], convert_to_tensor=True,
normalize_embeddings=True, device=DEVICE,
)[0]
max_k = max(ks)
results = hip.recall(query_emb, top_k=max_k)
recall_time = time.monotonic() - t2
# deduplicate by session_id, preserving rank order
seen = set()
ranked_sids = []
for r in results:
sid = r.metadata["session_id"]
if sid not in seen:
seen.add(sid)
ranked_sids.append(sid)
# compute metrics
metrics = {}
for k in ks:
top_k_sids = set(ranked_sids[:k])
hit = bool(answer_sids & top_k_sids)
metrics[f"R@{k}"] = 1.0 if hit else 0.0
# NDCG@10
ndcg = compute_ndcg(ranked_sids[:10], answer_sids)
metrics["NDCG@10"] = ndcg
metrics["embed_ms"] = embed_time * 1000
metrics["store_ms"] = store_time * 1000
metrics["recall_ms"] = recall_time * 1000
metrics["n_chunks"] = len(chunks)
return metrics
def compute_ndcg(ranked_sids, answer_sids, k=10):
"""Normalized Discounted Cumulative Gain."""
dcg = 0.0
for i, sid in enumerate(ranked_sids[:k]):
if sid in answer_sids:
dcg += 1.0 / math.log2(i + 2) # i+2 because rank starts at 1
# ideal: all answer sessions at top
n_relevant = min(len(answer_sids), k)
idcg = sum(1.0 / math.log2(i + 2) for i in range(n_relevant))
return dcg / idcg if idcg > 0 else 0.0
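# worked example: ranked_sids = ["s1", "s3", "s2"], answer_sids = {"s3"}:
# the hit is at i=1, so DCG = 1/log2(3) ≈ 0.631; IDCG = 1/log2(2) = 1.0,
# giving NDCG ≈ 0.631 (a hit at rank 1 would score 1.0)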
# ── main ────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--data", default="benchmarks/longmemeval.json")
parser.add_argument("--limit", type=int, default=0, help="limit number of questions (0=all)")
parser.add_argument("--granularity", choices=["session", "turn"], default="turn")
args = parser.parse_args()
print(f"LongMemEval benchmark for nocmem")
print(f"granularity: {args.granularity}")
print(f"device: {DEVICE}")
print()
with open(args.data) as f:
data = json.load(f)
if args.limit:
data = data[:args.limit]
encoder = load_encoder()
print(f"evaluating {len(data)} questions...\n")
all_metrics = []
by_type = {}
for i, item in enumerate(data):
metrics = evaluate_question(encoder, item, args.granularity)
all_metrics.append(metrics)
qtype = item["question_type"]
if qtype not in by_type:
by_type[qtype] = []
by_type[qtype].append(metrics)
# progress
if (i + 1) % 10 == 0 or i == len(data) - 1:
r5 = sum(m["R@5"] for m in all_metrics) / len(all_metrics) * 100
r10 = sum(m["R@10"] for m in all_metrics) / len(all_metrics) * 100
avg_recall = sum(m["recall_ms"] for m in all_metrics) / len(all_metrics)
print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")
# final results
n = len(all_metrics)
r5 = sum(m["R@5"] for m in all_metrics) / n * 100
r10 = sum(m["R@10"] for m in all_metrics) / n * 100
ndcg = sum(m["NDCG@10"] for m in all_metrics) / n * 100
avg_embed = sum(m["embed_ms"] for m in all_metrics) / n
avg_store = sum(m["store_ms"] for m in all_metrics) / n
avg_recall = sum(m["recall_ms"] for m in all_metrics) / n
avg_chunks = sum(m["n_chunks"] for m in all_metrics) / n
print(f"\n{'='*60}")
print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
print(f"{'='*60}")
print(f" Questions: {n}")
print(f" Avg chunks: {avg_chunks:.0f}")
print(f"")
print(f" R@5: {r5:.1f}%")
print(f" R@10: {r10:.1f}%")
print(f" NDCG@10: {ndcg:.1f}%")
print(f"")
print(f" Avg embed: {avg_embed:.0f}ms")
print(f" Avg store: {avg_store:.0f}ms")
print(f" Avg recall: {avg_recall:.1f}ms")
print(f"\n── by question type ──")
for qtype, ms in sorted(by_type.items()):
nt = len(ms)
tr5 = sum(m["R@5"] for m in ms) / nt * 100
tr10 = sum(m["R@10"] for m in ms) / nt * 100
print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")
print(f"\n── comparison ──")
print(f" MemPalace (raw, session): R@5=96.6%")
print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,178 @@
"""Does recall noise decrease as memory count grows?
At various scales, measure:
1. Recall accuracy (R@3) for relevant queries
2. Max cosine similarity for irrelevant queries
3. Separation gap between relevant and irrelevant
If nocmem works well at scale, the gap should widen — relevant queries
should score much higher than irrelevant ones as the memory pool grows.
"""
import json
import time
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda"
EMBED_DIM = 384
DATA_FILE = "benchmarks/longmemeval.json"
IRRELEVANT_QUERIES = [
"今天天气怎么样",
"你喜欢吃什么",
"",
"讲个笑话",
"明天会下雨吗",
"你觉得猫可爱还是狗可爱",
"人生的意义是什么",
"帮我写一首诗",
"地球到月球有多远",
"如何学会游泳",
]
BETA_CONFIGS = [16.0, 32.0, 64.0]
SCALES = [50, 200, 500, 1000, 3000]
def main():
print("noise vs scale benchmark\n")
print("loading encoder...")
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
def emb(text):
return encoder.encode([text], convert_to_tensor=True,
normalize_embeddings=True, device=DEVICE)[0]
def emb_batch(texts):
return encoder.encode(texts, convert_to_tensor=True,
normalize_embeddings=True, device=DEVICE,
batch_size=256, show_progress_bar=False)
# load data
print("loading data...")
with open(DATA_FILE) as f:
data = json.load(f)
# collect unique chunks with their source question index
all_chunks = [] # (text, question_idx, session_id)
seen = set()
for qi, item in enumerate(data):
for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
for i in range(0, len(sess) - 1, 2):
key = (sid, i)
if key in seen:
continue
seen.add(key)
user = sess[i]["content"]
asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
text = f"{user}\n{asst}"[:1000]
all_chunks.append((text, qi, sid))
print(f" {len(all_chunks)} unique chunks")
# pre-embed irrelevant queries
irrel_embs = [emb(q) for q in IRRELEVANT_QUERIES]
    # collect relevant queries: for each question, we know its answer sessions
    # (take the first 100 questions; whether their answer sessions are present
    # in a given subset is checked per-scale below)
relevant_queries = []
for item in data[:100]:
answer_sids = set(item["answer_session_ids"])
relevant_queries.append((item["question"], answer_sids))
rel_query_embs = emb_batch([q for q, _ in relevant_queries])
print(f" {len(relevant_queries)} relevant queries")
print(f" {len(IRRELEVANT_QUERIES)} irrelevant queries")
# filter scales to what we have
scales = [s for s in SCALES if s <= len(all_chunks)]
for beta in BETA_CONFIGS:
print(f"\n{'='*70}")
print(f" β = {beta}")
print(f"{'='*70}")
print(f"{'Scale':>7} | {'R@3':>6} | {'Rel maxcos':>10} {'Irrel maxcos':>12} {'Gap':>8} | {'Rel attn':>9} {'Irrel attn':>11}")
print("-" * 80)
for n in scales:
subset = all_chunks[:n]
texts = [c[0] for c in subset]
sids = [c[2] for c in subset]
# embed and build memory
embeddings = emb_batch(texts)
hip = HippocampalMemory(
embed_dim=EMBED_DIM, beta=beta, hopfield_top_k=10, device=DEVICE,
)
for i in range(n):
hip.store(embeddings[i], embeddings[i],
metadata={"session_id": sids[i]})
cue_mat = hip._get_cue_matrix()
# --- relevant queries ---
rel_max_cos = []
rel_top_attn = []
hits = 0
tested = 0
for qi in range(len(relevant_queries)):
question, answer_sids = relevant_queries[qi]
qe = rel_query_embs[qi]
# check if any answer session is in this subset
subset_sids = set(sids)
if not (answer_sids & subset_sids):
continue
tested += 1
# cosine sim
cos_sims = qe @ cue_mat.T
rel_max_cos.append(cos_sims.max().item())
# recall
results = hip.recall(qe, top_k=3)
top_attn = results[0].similarity if results else 0
rel_top_attn.append(top_attn)
recalled_sids = {r.metadata["session_id"] for r in results}
if answer_sids & recalled_sids:
hits += 1
r3 = hits / tested * 100 if tested > 0 else 0
avg_rel_cos = np.mean(rel_max_cos) if rel_max_cos else 0
avg_rel_attn = np.mean(rel_top_attn) if rel_top_attn else 0
# --- irrelevant queries ---
irrel_max_cos = []
irrel_top_attn = []
for qe in irrel_embs:
cos_sims = qe @ cue_mat.T
irrel_max_cos.append(cos_sims.max().item())
results = hip.recall(qe, top_k=3)
top_attn = results[0].similarity if results else 0
irrel_top_attn.append(top_attn)
avg_irrel_cos = np.mean(irrel_max_cos)
avg_irrel_attn = np.mean(irrel_top_attn)
gap = avg_rel_cos - avg_irrel_cos
print(f"{n:>7,} | {r3:>5.1f}% | {avg_rel_cos:>10.3f} {avg_irrel_cos:>12.3f} {gap:>8.3f} | {avg_rel_attn:>8.0%} {avg_irrel_attn:>10.0%}")
del hip
torch.cuda.empty_cache()
print(f"\n── 解读 ──")
print(f"Rel maxcos: 相关查询的最大余弦相似度(越高越好)")
print(f"Irrel maxcos: 无关查询的最大余弦相似度(越低越好)")
print(f"Gap: 两者之差(越大越好 = 越容易区分)")
print(f"Rel attn: 相关查询 top1 的 Hopfield attention 权重")
print(f"Irrel attn: 无关查询 top1 的 Hopfield attention 权重(越低 = 越少噪音)")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,104 @@
"""Test Hopfield attention sharpness with different top_k and beta.
Goal: find settings that give "either clearly remembered or nothing"
instead of flat attention across 20 candidates.
"""
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda"
EMBED_DIM = 384
print("loading encoder...")
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
def emb(text):
return encoder.encode([text], convert_to_tensor=True, normalize_embeddings=True, device=DEVICE)[0]
# store the same memories in each config
MEMORIES = [
("bot的名字叫什么", "bot的名字叫小乖是Fam给取的"),
("有哪些工具可以用", "工具有: fam_todo, send_file, spawn_agent, run_shell, run_python, update_memory"),
("vLLM在5090上的性能", "RTX 5090上vLLM跑gemma只有4.8 tok/s需要切换到awq_marlin"),
("repo-vis项目是什么", "repo-vis用Rust后端+Three.js前端的3D代码库可视化目标支持Linux内核和Pico VR"),
("repo-vis的性能瓶颈", "Linux内核79K文件SQLite 1GB上限和O(n)反序列化是瓶颈需要n-ary tree按需合并"),
("明天的待办事项", "最紧迫的是emblem scanner的AI Chat和KB部分"),
("后端切换到了什么", "NOC后端切换到了vLLM速度变快了"),
("数据库密码在哪里", "数据库密码存在 /etc/secrets/db.env 文件中"),
("什么GPU", "服务器有NVIDIA RTX 4090 24GB VRAM"),
("home有多少log文件", "home目录及子目录下共有960个.log文件"),
]
QUERIES = [
("repo-vis怎么样了", "repo-vis", True), # should recall clearly
("数据库密码", "密码", True), # should recall clearly
("今天天气怎么样", "天气", False), # irrelevant, should recall nothing
("vllm速度", "vllm", True), # should recall clearly
("你喜欢吃什么", "吃什么", False), # irrelevant
("VR支持", "VR", True), # edge case
]
CONFIGS = [
# (top_k, beta, label)
(20, 16.0, "baseline (top_k=20, β=16)"),
(10, 16.0, "top_k=10, β=16"),
(5, 16.0, "top_k=5, β=16"),
(20, 32.0, "top_k=20, β=32"),
(20, 64.0, "top_k=20, β=64"),
(10, 32.0, "top_k=10, β=32"),
(5, 32.0, "top_k=5, β=32"),
(5, 64.0, "top_k=5, β=64"),
]
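# why β sharpens: assuming similarity is a softmax over β-scaled cosine scores
# (which is what sweeping β here probes), two candidates at cos 0.8 and 0.6
# get softmax([12.8, 9.6]) ≈ [0.96, 0.04] at β=16 but ≈ [1.00, 0.00] at β=64;
# higher β pushes toward "one clear winner or nothing", per the docstring goal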
# pre-embed everything
mem_embs = [(emb(c), emb(t), c, t) for c, t in MEMORIES]
query_embs = [(emb(q), label, relevant) for q, label, relevant in QUERIES]
print(f"\n{len(MEMORIES)} memories, {len(QUERIES)} queries, {len(CONFIGS)} configs\n")
for top_k, beta, label in CONFIGS:
print(f"{'='*70}")
print(f" {label}")
print(f"{'='*70}")
hip = HippocampalMemory(
embed_dim=EMBED_DIM, hopfield_top_k=top_k, beta=beta, device=DEVICE,
)
for ce, te, cue_text, target_text in mem_embs:
hip.store(ce, te, metadata={"cue": cue_text, "target": target_text})
for qe, qlabel, should_recall in query_embs:
results = hip.recall(qe, top_k=5)
# show distribution
sims = [r.similarity for r in results]
top1 = sims[0] if sims else 0
top2 = sims[1] if len(sims) > 1 else 0
gap = top1 - top2 # gap between #1 and #2
above_5pct = sum(1 for s in sims if s >= 0.05)
above_10pct = sum(1 for s in sims if s >= 0.10)
top_target = results[0].metadata["target"][:40] if results else ""
tag = "" if should_recall else ""
print(f" [{tag}] {qlabel:10s} top1={top1:.0%} top2={top2:.0%} gap={gap:.0%} "
f"≥5%:{above_5pct} ≥10%:{above_10pct}{top_target}")
# summary: average sharpness
total_gap = 0
total_top1 = 0
for qe, qlabel, _ in query_embs:
results = hip.recall(qe, top_k=5)
sims = [r.similarity for r in results]
total_top1 += sims[0] if sims else 0
total_gap += (sims[0] - sims[1]) if len(sims) > 1 else 0
n = len(query_embs)
print(f"\n avg top1={total_top1/n:.0%} avg gap={total_gap/n:.0%}")
print()
del hip
torch.cuda.empty_cache()