NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025). Architecture: - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle) - Multi-hop: Hebbian W matrix with WTA pattern separation - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency - 4ms latency @ 20K memories, ~1GB VRAM Key findings: - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian) - WTA pattern separation enables 20K+ capacity - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this - MiniLM-L6 is optimal (discrimination gap > absolute similarity) - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark - SNN encoder viable (CosSim 0.99) but not needed for current architecture
This commit is contained in:
211
experiments/exp05_benchmark.py
Normal file
211
experiments/exp05_benchmark.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Experiment 5: Performance benchmarks.
|
||||
|
||||
Measure:
|
||||
1. Learning throughput (memories/second)
|
||||
2. Recall latency (ms per query)
|
||||
3. GPU memory usage at different scales
|
||||
4. Multi-hop latency vs hops
|
||||
5. End-to-end: embed + separate + recall pipeline
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
# Hard-coded CUDA device: all benchmarks below call torch.cuda.* APIs
# (synchronize, memory stats), so a GPU is required — no CPU fallback here.
DEVICE = "cuda"
# Sibling doc/ directory of the experiments/ folder.
# NOTE(review): presumably where benchmark results would be written;
# unused in the visible code — confirm before removing.
RESULTS_DIR = Path(__file__).parent.parent / "doc"
||||
def winner_take_all(x, k):
    """Sparsify *x* along its last dim: 1.0 at each top-k position, 0.0 elsewhere."""
    _, top_idx = torch.topk(x, k, dim=-1)
    mask = torch.zeros_like(x)
    # Non-inplace scatter builds the binary code in one expression.
    return mask.scatter(-1, top_idx, 1.0)
|
||||
|
||||
|
||||
class BenchMemory:
    """Minimal Hebbian associative memory used purely for benchmarking.

    Random projection + winner-take-all gives a sparse code; associations
    are stored as outer products accumulated into a single weight matrix W.
    """

    def __init__(self, input_dim, code_dim, k):
        self.code_dim = code_dim
        self.k = k
        # Scaled random projection from embedding space into code space.
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        # Hebbian association matrix (target-code x cue-code).
        self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Pattern-separate *x* into a k-sparse binary code."""
        projected = x @ self.proj
        return winner_take_all(projected, self.k)

    def learn(self, cue, target):
        """Bind *cue* to *target* with a Hebbian outer-product update."""
        self.W += torch.outer(self.sep(target), self.sep(cue))

    def recall(self, query, hops=1):
        """Recall the code associated with *query*, chaining *hops* lookups."""
        code = self.sep(query)
        hop = 0
        while hop < hops:
            code = winner_take_all(self.W @ code, self.k)
            hop += 1
        return code
|
||||
|
||||
|
||||
def benchmark_learn(input_dim, code_dim, k, n_memories):
    """Measure Hebbian learning throughput.

    Args:
        input_dim: dimensionality of the cue/target embeddings.
        code_dim: sparse code dimensionality of the memory.
        k: number of active units in the WTA code.
        n_memories: number of random cue/target pairs to store.

    Returns:
        (rate, dt): memories stored per second, and total wall time in seconds.
    """
    mem = BenchMemory(input_dim, code_dim, k)
    cues = torch.randn(n_memories, input_dim, device=DEVICE)
    targets = torch.randn(n_memories, input_dim, device=DEVICE)

    # Drain pending GPU work so it is excluded from the timed region.
    torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # (NTP adjustments) and has coarser resolution, skewing benchmarks.
    t0 = time.perf_counter()
    for i in range(n_memories):
        mem.learn(cues[i], targets[i])
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    return n_memories / dt, dt
|
||||
|
||||
|
||||
def benchmark_recall(input_dim, code_dim, k, n_memories, n_queries=1000, hops=1):
    """Measure recall latency.

    Args:
        input_dim: dimensionality of the embeddings.
        code_dim: sparse code dimensionality of the memory.
        k: number of active units in the WTA code.
        n_memories: number of random associations to pre-fill.
        n_queries: number of timed recall calls.
        hops: associative hops per recall.

    Returns:
        Average latency in milliseconds per query.
    """
    mem = BenchMemory(input_dim, code_dim, k)

    # Pre-fill with random associations so W is representative of real use.
    for _ in range(n_memories):
        c = torch.randn(input_dim, device=DEVICE)
        t = torch.randn(input_dim, device=DEVICE)
        mem.learn(c, t)

    queries = torch.randn(n_queries, input_dim, device=DEVICE)

    # Drain pending GPU work, then time with the monotonic perf_counter
    # clock (time.time() is wall-clock and can be adjusted mid-run).
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for i in range(n_queries):
        mem.recall(queries[i], hops=hops)
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    return dt / n_queries * 1000  # ms per query
|
||||
|
||||
|
||||
def benchmark_memory_usage(input_dim, code_dims):
    """Measure GPU memory at different code_dim."""
    MB = 1024**2
    results = {}
    for dim in code_dims:
        # Start each size from a clean slate so measurements don't bleed over.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        before = torch.cuda.memory_allocated()
        mem = BenchMemory(input_dim, dim, k=50)
        # Populate with 1000 random associations so W reflects real use.
        for _ in range(1000):
            cue = torch.randn(input_dim, device=DEVICE)
            target = torch.randn(input_dim, device=DEVICE)
            mem.learn(cue, target)

        after = torch.cuda.memory_allocated()
        peak = torch.cuda.max_memory_allocated()

        # Analytic sizes assume float32 (4 bytes per element).
        w_size = dim * dim * 4 / MB
        proj_size = input_dim * dim * 4 / MB
        total_allocated = (after - before) / MB

        results[dim] = {
            "W_size_MB": w_size,
            "proj_size_MB": proj_size,
            "total_allocated_MB": total_allocated,
            "peak_MB": peak / MB,
        }
        print(f" code_dim={dim:>6}: W={w_size:.0f}MB, proj={proj_size:.0f}MB, "
              f"total={total_allocated:.0f}MB")

        del mem
    return results
|
||||
|
||||
|
||||
def main():
    """Run all five benchmark suites and print results to stdout.

    Requires a CUDA device (DEVICE is hard-coded to "cuda") and, for the
    end-to-end test, the sentence-transformers package.
    """
    print("=" * 60)
    print("Experiment 5: Performance Benchmarks")
    print("=" * 60)

    input_dim = 384  # MiniLM dimension

    # Test 1: Learning throughput
    print("\n=== Learning Throughput ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n in [1000, 5000, 10000]:
            rate, dt = benchmark_learn(input_dim, code_dim, k, n)
            print(f" code={code_dim}, k={k}, N={n:>5}: "
                  f"{rate:>8.0f} memories/s ({dt:.2f}s)")
        torch.cuda.empty_cache()

    # Test 2: Recall latency
    print("\n=== Recall Latency ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n_mem in [100, 1000, 10000]:
            ms = benchmark_recall(input_dim, code_dim, k, n_mem, n_queries=1000)
            print(f" code={code_dim}, k={k}, N={n_mem:>5}: {ms:.3f} ms/query")
        torch.cuda.empty_cache()

    # Test 3: Multi-hop latency
    print("\n=== Multi-hop Latency ===")
    for hops in [1, 2, 3, 5, 10]:
        ms = benchmark_recall(input_dim, 16384, 50, 1000, n_queries=1000, hops=hops)
        print(f" hops={hops:>2}: {ms:.3f} ms/query")

    # Test 4: GPU Memory
    print("\n=== GPU Memory Usage ===")
    benchmark_memory_usage(input_dim, [4096, 8192, 16384, 32768, 65536])

    # Test 5: End-to-end with sentence-transformers (imported lazily so the
    # other benchmarks run even without the package installed).
    print("\n=== End-to-End Pipeline Latency ===")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

    mem = BenchMemory(384, 16384, 50)
    # Pre-fill 1000 memories: each sentence is associated with the next one.
    sentences = [f"This is test sentence number {i}" for i in range(1000)]
    embs = model.encode(sentences, convert_to_tensor=True,
                        normalize_embeddings=True, device=DEVICE)
    for i in range(1000):
        mem.learn(embs[i], embs[min(i+1, 999)])

    # Benchmark single query pipeline
    query = "What is the test sentence?"
    n_runs = 100

    # Timed with perf_counter (monotonic, high-resolution); time.time() is
    # wall-clock and can be adjusted mid-run, corrupting the measurement.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(n_runs):
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        recalled = mem.recall(q_emb, hops=1)
    torch.cuda.synchronize()
    dt = (time.perf_counter() - t0) / n_runs * 1000

    # Breakdown: time embedding and recall separately, syncing between
    # stages so GPU work is attributed to the right phase.
    t_embed = 0
    t_recall = 0
    for _ in range(n_runs):
        torch.cuda.synchronize()
        t1 = time.perf_counter()
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        torch.cuda.synchronize()
        t2 = time.perf_counter()
        recalled = mem.recall(q_emb, hops=1)
        torch.cuda.synchronize()
        t3 = time.perf_counter()
        t_embed += t2 - t1
        t_recall += t3 - t2

    t_embed = t_embed / n_runs * 1000
    t_recall = t_recall / n_runs * 1000

    print(f" Total: {dt:.1f} ms/query")
    print(f" Embedding: {t_embed:.1f} ms")
    print(f" Recall: {t_recall:.3f} ms")
    print(f" Ratio: embedding is {t_embed/t_recall:.0f}x slower than recall")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user