NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025). Architecture: - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle) - Multi-hop: Hebbian W matrix with WTA pattern separation - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency - 4ms latency @ 20K memories, ~1GB VRAM Key findings: - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian) - WTA pattern separation enables 20K+ capacity - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this - MiniLM-L6 is optimal (discrimination gap > absolute similarity) - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark - SNN encoder viable (CosSim 0.99) but not needed for current architecture
This commit is contained in:
222
experiments/exp09_embedding_models.py
Normal file
222
experiments/exp09_embedding_models.py
Normal file
@@ -0,0 +1,222 @@
|
||||
"""Experiment P1: Better embedding models.
|
||||
|
||||
MiniLM (22M) has weak paraphrase similarity for many pairs.
|
||||
Test: BGE-small (33M), BGE-base (109M), and E5-small (33M).
|
||||
Skip large models (330M+) due to VRAM budget with Hebbian W.
|
||||
|
||||
Measure:
|
||||
1. Paraphrase pair cosine similarity (gap between same/diff pairs)
|
||||
2. Recall accuracy with Hopfield at 2K background
|
||||
3. Encoding speed
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
DEVICE = "cuda"
|
||||
|
||||
# Test pairs (same as exp07e)
|
||||
PAIRS = [
|
||||
("What's the weather like today?", "User checks weather every morning"),
|
||||
("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
|
||||
("The database is slow again", "Missing index on users table"),
|
||||
("I need to fix the authentication bug", "JWT tokens with 24h expiry in Redis"),
|
||||
("The API returns 500 errors", "OOM in the Python worker"),
|
||||
("Let's set up monitoring", "Prometheus + Grafana on OCI"),
|
||||
("Tests failing in CI", "CI needs postgres service container"),
|
||||
("Memory usage too high", "Leak in websocket handler"),
|
||||
("Help with Docker setup", "docker-compose for dev, k3s for prod"),
|
||||
("Log files too large", "Logs rotate daily, shipped to Loki"),
|
||||
("How to add caching?", "Redis available at redis.internal:6379"),
|
||||
("Frontend loads slowly", "CDN CloudFlare, 1h TTL for assets"),
|
||||
("Refactor payment module", "Stripe API, webhook in payments/webhook.py"),
|
||||
("Set up new server", "Ubuntu 22.04, Docker, Tailscale, monitoring"),
|
||||
("Optimize search", "Elasticsearch v8, recently upgraded"),
|
||||
("Backup the database", "Daily 3am UTC cron to S3"),
|
||||
("Configure reverse proxy", "Traefik, not nginx"),
|
||||
("Team meeting schedule", "Standup 10am London, Mon-Fri"),
|
||||
("Learn a new programming language", "User has Python+Go, new to systems"),
|
||||
("Review my pull request", "User prefers small PRs with clear commits"),
|
||||
]
|
||||
|
||||
PARAPHRASES = [
|
||||
"How's the weather?", "Ship the release", "DB performance terrible",
|
||||
"Fix the login issue", "Server errors everywhere", "Need observability",
|
||||
"CI tests breaking", "Service using too much RAM", "Docker config help",
|
||||
"Logs eating disk space", "Want to add a cache layer", "Website too slow",
|
||||
"Payment code needs rework", "Provision a new machine", "Search is slow",
|
||||
"Need a DB backup", "Proxy configuration", "When's the standup?",
|
||||
"Want to learn Rust", "Check my pull request",
|
||||
]
|
||||
|
||||
|
||||
def winner_take_all(x, k):
    """Binary WTA mask: 1.0 at the k largest entries along the last dim, 0.0 elsewhere."""
    top_indices = x.topk(k, dim=-1).indices
    mask = torch.zeros_like(x)
    mask.scatter_(-1, top_indices, 1.0)
    return mask
|
||||
|
||||
|
||||
def cosine(a, b):
    """Cosine similarity of two 1-D tensors, returned as a Python float."""
    sim = nn.functional.cosine_similarity(a[None, :], b[None, :])
    return sim.item()
|
||||
|
||||
|
||||
class TwoStageHopfield:
    """Two-stage associative memory: nearest-neighbour shortlist, then Hopfield settle.

    Stage 1 pre-selects the top-k stored cues by dot product with the query.
    Stage 2 runs a few steps of modern-Hopfield softmax-attention dynamics over
    that shortlist, then reads out the attention-weighted associated targets.
    """

    def __init__(self, embed_dim, beta=16.0, top_k=20):
        # embed_dim kept for interface compatibility; storage is list-based.
        self.beta = beta      # inverse temperature of the softmax settle
        self.top_k = top_k    # shortlist size for stage 1
        self.cue_embs = []
        self.target_embs = []

    def learn(self, cue_emb, target_emb):
        """Store one (cue, target) embedding pair (detached from autograd)."""
        self.cue_embs.append(cue_emb.detach())
        self.target_embs.append(target_emb.detach())

    def recall(self, query_emb, steps=3):
        """Return the normalized target read out for query_emb."""
        cues = torch.stack(self.cue_embs)
        targets = torch.stack(self.target_embs)

        # Stage 1: shortlist the k stored cues most similar to the query.
        k = min(self.top_k, len(self.cue_embs))
        _, shortlist = (query_emb @ cues.T).topk(k)
        short_cues = cues[shortlist]
        short_targets = targets[shortlist]

        # Stage 2: settle the state onto the cue manifold.
        state = query_emb
        for _ in range(steps):
            attn = torch.softmax(self.beta * (state @ short_cues.T), dim=0)
            state = nn.functional.normalize(attn @ short_cues, dim=0)

        # Read out targets using the settled state's attention weights.
        attn = torch.softmax(self.beta * (state @ short_cues.T), dim=0)
        return nn.functional.normalize(attn @ short_targets, dim=0)
|
||||
|
||||
|
||||
def evaluate_model(model_name):
    """Full evaluation of one embedding model.

    Loads `model_name` via sentence-transformers on DEVICE and measures:
      1. paraphrase similarity gap (same-pair vs cross-pair cosine),
      2. encoding speed over a 100-sentence batch,
      3. Hopfield recall accuracy for the 20 PAIRS plus a 2K synthetic
         background, and the allocated CUDA VRAM.

    Returns a dict of metrics consumed by main()'s summary table.
    """
    # Imported lazily so module import does not require the dependency.
    from sentence_transformers import SentenceTransformer

    print(f"\n--- {model_name} ---")
    t0 = time.time()
    model = SentenceTransformer(model_name, device=DEVICE)
    load_time = time.time() - t0
    embed_dim = model.get_sentence_embedding_dimension()
    print(f" Dim: {embed_dim}, Load: {load_time:.1f}s")

    # 1. Paraphrase similarity gap
    # Embeddings are L2-normalized so dot product equals cosine similarity.
    cue_texts = [p[0] for p in PAIRS]
    cue_embs = model.encode(cue_texts, convert_to_tensor=True,
                            normalize_embeddings=True, device=DEVICE)
    para_embs = model.encode(PARAPHRASES, convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)
    target_embs = model.encode([p[1] for p in PAIRS], convert_to_tensor=True,
                               normalize_embeddings=True, device=DEVICE)

    # same_sims[i]: cue i vs its own paraphrase; diff_sims: all mismatched pairs.
    same_sims = [cosine(cue_embs[i], para_embs[i]) for i in range(len(PAIRS))]
    diff_sims = []
    for i in range(len(PAIRS)):
        for j in range(len(PAIRS)):
            if i != j:
                diff_sims.append(cosine(cue_embs[i], para_embs[j]))

    mean_same = np.mean(same_sims)
    mean_diff = np.mean(diff_sims)
    min_same = np.min(same_sims)
    # Discrimination gap — the key selection metric (see module docstring).
    gap = mean_same - mean_diff

    print(f" Similarity: same={mean_same:.3f} (min={min_same:.3f}), "
          f"diff={mean_diff:.3f}, gap={gap:.3f}")

    # Show worst pairs
    worst_idx = np.argsort(same_sims)[:3]
    for idx in worst_idx:
        print(f" Worst: {same_sims[idx]:.3f} '{cue_texts[idx][:30]}...' ↔ '{PARAPHRASES[idx][:30]}...'")

    # 2. Encoding speed
    texts_100 = [f"Test sentence number {i} about various topics" for i in range(100)]
    t0 = time.time()
    model.encode(texts_100, convert_to_tensor=True, device=DEVICE)
    speed = 100 / (time.time() - t0)
    print(f" Speed: {speed:.0f} sentences/s")

    # 3. Recall with 2K background
    mem = TwoStageHopfield(embed_dim, beta=16.0, top_k=20)
    for i in range(len(PAIRS)):
        mem.learn(cue_embs[i], target_embs[i])

    # Background
    # 2000 distractor pairs to stress pattern separation at realistic load.
    bg_cues = [f"The {['server','db','api','fe','be','cache'][i%6]} has issue {i}"
               for i in range(2000)]
    bg_targets = [f"Fix issue {i}" for i in range(2000)]
    bg_cue_embs = model.encode(bg_cues, convert_to_tensor=True,
                               normalize_embeddings=True, device=DEVICE, batch_size=256)
    bg_target_embs = model.encode(bg_targets, convert_to_tensor=True,
                                  normalize_embeddings=True, device=DEVICE, batch_size=256)
    for i in range(2000):
        mem.learn(bg_cue_embs[i], bg_target_embs[i])

    # A recall counts as correct when the recalled vector is closest to the
    # target paired with the paraphrase's original cue (index-aligned).
    correct = 0
    for i in range(len(PARAPHRASES)):
        recalled = mem.recall(para_embs[i])
        all_sims = [cosine(recalled, target_embs[j]) for j in range(len(PAIRS))]
        if np.argmax(all_sims) == i:
            correct += 1

    n = len(PARAPHRASES)
    print(f" Recall (20 pairs + 2K bg): {correct}/{n} ({correct/n:.0%})")

    # VRAM
    vram = torch.cuda.memory_allocated() / 1024**2
    print(f" VRAM: {vram:.0f} MB")

    # Release GPU memory before the next model in the sweep is loaded.
    del model, mem
    torch.cuda.empty_cache()

    return {
        "model": model_name, "dim": embed_dim,
        "same_sim": mean_same, "diff_sim": mean_diff, "gap": gap,
        "min_same": min_same, "speed": speed,
        "recall": correct / n, "vram_mb": vram,
    }
|
||||
|
||||
|
||||
def main():
    """Run the embedding-model comparison sweep and print a summary table."""
    banner = "=" * 60
    print(banner)
    print("Experiment P1: Embedding Model Comparison")
    print(banner)

    candidates = [
        "all-MiniLM-L6-v2",        # Baseline, 22M, dim=384
        "BAAI/bge-small-en-v1.5",  # 33M, dim=384
        "BAAI/bge-base-en-v1.5",   # 109M, dim=768
        "intfloat/e5-small-v2",    # 33M, dim=384
    ]

    rows = []
    for name in candidates:
        try:
            rows.append(evaluate_model(name))
        except Exception as e:
            # Best effort: one failing model must not abort the whole sweep.
            print(f" ERROR: {e}")

    # Summary table
    print("\n" + "=" * 80)
    print("SUMMARY")
    header = (f"{'Model':<30} {'Dim':>4} {'SameSim':>8} {'Gap':>6} "
              f"{'MinSim':>7} {'Recall':>7} {'Speed':>6} {'VRAM':>6}")
    print(header)
    print("-" * 80)
    for row in rows:
        print(f"{row['model']:<30} {row['dim']:>4} {row['same_sim']:>8.3f} "
              f"{row['gap']:>6.3f} {row['min_same']:>7.3f} "
              f"{row['recall']:>6.0%} {row['speed']:>5.0f}/s {row['vram_mb']:>5.0f}MB")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user