NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025). Architecture: - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle) - Multi-hop: Hebbian W matrix with WTA pattern separation - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency - 4ms latency @ 20K memories, ~1GB VRAM Key findings: - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian) - WTA pattern separation enables 20K+ capacity - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this - MiniLM-L6 is optimal (discrimination gap > absolute similarity) - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark - SNN encoder viable (CosSim 0.99) but not needed for current architecture
This commit is contained in:
237
experiments/exp11_scale_ceiling.py
Normal file
237
experiments/exp11_scale_ceiling.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""Experiment P3: Breaking the 20K 80% ceiling.
|
||||
|
||||
Hypothesis: NN pre-filter (top-20) misses the correct cue at large scale.
|
||||
|
||||
Tests:
|
||||
1. Oracle analysis: is the correct cue in top-K? What K is needed?
|
||||
2. Hierarchical memory: cluster memories, route query to relevant cluster
|
||||
3. Re-ranking: top-K NN → cross-similarity re-rank → Hopfield on re-ranked
|
||||
4. Multiple projections: ensemble of NN lookups with different random projections
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
# Prefer the GPU but fall back to CPU so the script also runs on
# machines without CUDA (the original hard-coded "cuda" and crashed there).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
# Ten (cue, target) memory pairs. Index i here must line up with
# PARAPHRASES[i] below — the tests score a retrieval as correct when the
# winning memory id equals the paraphrase index.
PAIRS = [
    ("What's the weather like today?", "User checks weather every morning"),
    ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
    ("The database is slow again", "Missing index on users table"),
    ("I need to fix the authentication bug", "JWT tokens with 24h expiry in Redis"),
    ("The API returns 500 errors", "OOM in the Python worker"),
    ("Let's set up monitoring", "Prometheus + Grafana on OCI"),
    ("Tests failing in CI", "CI needs postgres service container"),
    ("Memory usage too high", "Leak in websocket handler"),
    ("Help with Docker setup", "docker-compose for dev, k3s for prod"),
    ("Log files too large", "Logs rotate daily, shipped to Loki"),
]

# One paraphrased query per pair, in the same order as PAIRS.
# These are the retrieval cues used by every test below.
PARAPHRASES = [
    "How's the weather?", "Ship the release", "DB performance terrible",
    "Fix the login issue", "Server errors everywhere", "Need observability",
    "CI tests breaking", "Service using too much RAM", "Docker config help",
    "Logs eating disk space",
]
|
||||
|
||||
|
||||
def cosine(a, b):
    """Return the cosine similarity of two 1-D tensors as a Python float."""
    sim = nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0))
    return sim.item()
|
||||
|
||||
|
||||
def load_model():
    """Load the all-MiniLM-L6-v2 sentence encoder on DEVICE.

    The import is deliberately local so the module can be imported
    without sentence-transformers installed.
    """
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
|
||||
|
||||
|
||||
def build_memory(model, n_bg):
    """Encode the test pairs plus ``n_bg`` synthetic background memories.

    Returns a 6-tuple:
    (cue_mat, target_mat, mids, cue_embs, target_embs, para_embs) where
    ``mids`` gives each row's memory id — 0..9 for the test pairs,
    100+i for background item i.
    """
    def _enc(texts, **kw):
        # All embeddings are unit-normalized on DEVICE.
        return model.encode(texts, convert_to_tensor=True,
                            normalize_embeddings=True, device=DEVICE, **kw)

    cue_embs = _enc([cue for cue, _ in PAIRS])
    target_embs = _enc([tgt for _, tgt in PAIRS])
    para_embs = _enc(PARAPHRASES)

    rows_cues = list(cue_embs)
    rows_targets = list(target_embs)
    mids = list(range(len(PAIRS)))

    if n_bg > 0:
        topics = ["server", "db", "api", "fe", "be", "cache",
                  "queue", "net", "store", "auth", "docker", "k8s"]
        bg_cue_texts = [f"The {topics[i%len(topics)]} has issue {i}" for i in range(n_bg)]
        bg_target_texts = [f"Fix {topics[i%len(topics)]} issue {i}" for i in range(n_bg)]
        bg_c = _enc(bg_cue_texts, batch_size=256)
        bg_t = _enc(bg_target_texts, batch_size=256)
        rows_cues.extend(bg_c)
        rows_targets.extend(bg_t)
        # Background ids start at 100 so they never collide with pair ids 0..9.
        mids.extend(100 + i for i in range(n_bg))

    return (torch.stack(rows_cues), torch.stack(rows_targets), mids,
            cue_embs, target_embs, para_embs)
|
||||
|
||||
|
||||
def test_topk_coverage(model, n_bg_list):
    """Oracle analysis: for each memory size, how often does the correct
    cue appear in the NN top-K, and what K would be needed?
    """
    print("=== Test 1: Top-K Coverage Analysis ===\n")

    n = len(PARAPHRASES)
    for n_bg in n_bg_list:
        cue_mat, _tm, mids, _ce, _te, para_embs = build_memory(model, n_bg)

        for K in [5, 10, 20, 50, 100, 200]:
            in_topk = 0
            for i in range(n):
                sims = para_embs[i] @ cue_mat.T
                _, top_idx = sims.topk(min(K, len(mids)))
                # Correct iff memory id i survives the pre-filter.
                if i in (mids[j] for j in top_idx.tolist()):
                    in_topk += 1

            print(f" N={n_bg+len(PAIRS):>6}, K={K:>3}: "
                  f"{in_topk}/{n} ({in_topk/n:.0%}) correct cue in top-K")
        print()
|
||||
|
||||
|
||||
def test_two_stage_topk(model, n_bg):
    """Vary the NN pre-filter size K in two-stage Hopfield retrieval.

    Stage 1: cosine top-K over all stored cues.
    Stage 2: three rounds of softmax Hopfield settling (beta=16) over the
    candidate cues, then the attention mass is summed per memory id and
    the id with the largest mass is the retrieval result.

    Fixes vs. original: drops the unused ``cand_targets`` slice and the
    unused embedding tensors from the unpack — targets play no role in
    this cue-identification experiment.
    """
    print(f"\n=== Test 2: Two-Stage K Optimization (bg={n_bg}) ===\n")

    cue_mat, _target_mat, mids, _cue_embs, _target_embs, para_embs = build_memory(model, n_bg)

    n = len(PARAPHRASES)
    for K in [5, 10, 20, 50, 100, 200]:
        correct = 0
        for i in range(n):
            # Stage 1: nearest-neighbour pre-filter.
            sims = para_embs[i] @ cue_mat.T
            k = min(K, len(mids))
            _, top_idx = sims.topk(k)
            cand_cues = cue_mat[top_idx]
            cand_mids = [mids[j] for j in top_idx.tolist()]

            # Stage 2: Hopfield settle — pull the query toward its attractor.
            xi = para_embs[i]
            for _ in range(3):
                attn = torch.softmax(16.0 * (xi @ cand_cues.T), dim=0)
                xi = nn.functional.normalize(attn @ cand_cues, dim=0)

            attn = torch.softmax(16.0 * (xi @ cand_cues.T), dim=0)

            # Vote: aggregate attention per memory id.
            mid_scores = {}
            for j, mid in enumerate(cand_mids):
                mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item()

            if max(mid_scores, key=mid_scores.get) == i:
                correct += 1

        print(f" K={K:>3}: {correct}/{n} ({correct/n:.0%})")
|
||||
|
||||
|
||||
def test_hierarchical(model, n_bg):
    """Cluster cues with spherical k-means, route each query to its top-3
    clusters, then run two-stage Hopfield retrieval on their members.

    A query whose routed clusters yield no candidates is counted as
    incorrect (the ``continue`` skips the correct-counter).

    Fixes vs. original: removes the unused ``from torch import cdist``
    import and the unused ``cand_targets`` slice.
    """
    print(f"\n=== Test 3: Hierarchical Memory (bg={n_bg}) ===\n")

    cue_mat, _target_mat, mids, _cue_embs, _target_embs, para_embs = build_memory(model, n_bg)

    # Roughly one cluster per 100 memories, at least 10.
    n_clusters = max(10, (n_bg + len(PAIRS)) // 100)

    # Simple k-means on the unit-norm cue embeddings (cosine distance),
    # 20 fixed iterations, centroids seeded by random rows.
    N = cue_mat.shape[0]
    centroids = cue_mat[torch.randperm(N)[:n_clusters]].clone()
    for _ in range(20):
        dists = 1 - cue_mat @ centroids.T  # cosine distance
        assignments = dists.argmin(dim=1)
        for c in range(n_clusters):
            mask = assignments == c
            if mask.sum() > 0:  # empty clusters keep their previous centroid
                centroids[c] = nn.functional.normalize(cue_mat[mask].mean(dim=0), dim=0)

    correct = 0
    for i in range(len(PARAPHRASES)):
        # Route the query to the 3 most similar clusters.
        cluster_sims = para_embs[i] @ centroids.T
        top_clusters = cluster_sims.topk(3).indices

        # Gather (deduplicated) member indices of the routed clusters.
        cand_idx = []
        for c in top_clusters:
            cand_idx.extend((assignments == c).nonzero().squeeze(-1).tolist())
        cand_idx = list(set(cand_idx))

        if not cand_idx:
            continue  # no candidates -> scored as a miss

        cand_cues = cue_mat[cand_idx]
        cand_mids = [mids[j] for j in cand_idx]

        # NN pre-filter within the routed clusters.
        K = min(20, len(cand_idx))
        sims = para_embs[i] @ cand_cues.T
        _, top_local = sims.topk(K)
        local_cues = cand_cues[top_local]
        local_mids = [cand_mids[j] for j in top_local.tolist()]

        # Hopfield settle (beta=16, 3 iterations) on the local candidates.
        xi = para_embs[i]
        for _ in range(3):
            attn = torch.softmax(16.0 * (xi @ local_cues.T), dim=0)
            xi = nn.functional.normalize(attn @ local_cues, dim=0)

        attn = torch.softmax(16.0 * (xi @ local_cues.T), dim=0)

        # Vote: aggregate attention per memory id.
        mid_scores = {}
        for j, mid in enumerate(local_mids):
            mid_scores[mid] = mid_scores.get(mid, 0) + attn[j].item()

        if max(mid_scores, key=mid_scores.get) == i:
            correct += 1

    n = len(PARAPHRASES)
    print(f" Hierarchical (clusters={n_clusters}): {correct}/{n} ({correct/n:.0%})")
|
||||
|
||||
|
||||
def main():
    """Run all three scale experiments with a single shared encoder."""
    banner = "=" * 60
    print(banner)
    print("Experiment P3: Breaking the 20K Ceiling")
    print(banner)

    model = load_model()

    # Test 1: oracle top-K coverage across memory sizes.
    test_topk_coverage(model, [0, 500, 2000, 5000, 10000, 20000])

    # Test 2: optimal pre-filter K at three background sizes.
    for bg in (2000, 10000, 20000):
        test_two_stage_topk(model, bg)

    # Test 3: hierarchical routing at the same sizes.
    for bg in (2000, 10000, 20000):
        test_hierarchical(model, bg)
|
||||
|
||||
|
||||
# Script entry point: run all three experiments.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user