NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025). Architecture: - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle) - Multi-hop: Hebbian W matrix with WTA pattern separation - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency - 4ms latency @ 20K memories, ~1GB VRAM Key findings: - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian) - WTA pattern separation enables 20K+ capacity - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this - MiniLM-L6 is optimal (discrimination gap > absolute similarity) - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark - SNN encoder viable (CosSim 0.99) but not needed for current architecture
This commit is contained in:
211
experiments/exp05_benchmark.py
Normal file
211
experiments/exp05_benchmark.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Experiment 5: Performance benchmarks.
|
||||
|
||||
Measure:
|
||||
1. Learning throughput (memories/second)
|
||||
2. Recall latency (ms per query)
|
||||
3. GPU memory usage at different scales
|
||||
4. Multi-hop latency vs hops
|
||||
5. End-to-end: embed + separate + recall pipeline
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
# Hard-coded CUDA device: all benchmarks below call torch.cuda.* APIs
# (synchronize, memory stats), so a GPU is required — no CPU fallback here.
DEVICE = "cuda"
# Sibling doc/ directory of the experiments/ folder.
# NOTE(review): presumably where benchmark results would be written;
# unused in the visible code — confirm before removing.
RESULTS_DIR = Path(__file__).parent.parent / "doc"
||||
def winner_take_all(x, k):
    """Sparsify *x* along its last dim: 1.0 at each top-k position, 0.0 elsewhere."""
    _, top_idx = torch.topk(x, k, dim=-1)
    mask = torch.zeros_like(x)
    # Non-inplace scatter builds the binary code in one expression.
    return mask.scatter(-1, top_idx, 1.0)
|
||||
|
||||
|
||||
class BenchMemory:
    """Minimal Hebbian associative memory used purely for benchmarking.

    Random projection + winner-take-all gives a sparse code; associations
    are stored as outer products accumulated into a single weight matrix W.
    """

    def __init__(self, input_dim, code_dim, k):
        self.code_dim = code_dim
        self.k = k
        # Scaled random projection from embedding space into code space.
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        # Hebbian association matrix (target-code x cue-code).
        self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Pattern-separate *x* into a k-sparse binary code."""
        projected = x @ self.proj
        return winner_take_all(projected, self.k)

    def learn(self, cue, target):
        """Bind *cue* to *target* with a Hebbian outer-product update."""
        self.W += torch.outer(self.sep(target), self.sep(cue))

    def recall(self, query, hops=1):
        """Recall the code associated with *query*, chaining *hops* lookups."""
        code = self.sep(query)
        hop = 0
        while hop < hops:
            code = winner_take_all(self.W @ code, self.k)
            hop += 1
        return code
|
||||
|
||||
|
||||
def benchmark_learn(input_dim, code_dim, k, n_memories):
    """Measure Hebbian learning throughput.

    Args:
        input_dim: dimensionality of the cue/target embeddings.
        code_dim: sparse code dimensionality of the memory.
        k: number of active units in the WTA code.
        n_memories: number of random cue/target pairs to store.

    Returns:
        (rate, dt): memories stored per second, and total wall time in seconds.
    """
    mem = BenchMemory(input_dim, code_dim, k)
    cues = torch.randn(n_memories, input_dim, device=DEVICE)
    targets = torch.randn(n_memories, input_dim, device=DEVICE)

    # Drain pending GPU work so it is excluded from the timed region.
    torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # (NTP adjustments) and has coarser resolution, skewing benchmarks.
    t0 = time.perf_counter()
    for i in range(n_memories):
        mem.learn(cues[i], targets[i])
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    return n_memories / dt, dt
|
||||
|
||||
|
||||
def benchmark_recall(input_dim, code_dim, k, n_memories, n_queries=1000, hops=1):
    """Measure recall latency.

    Args:
        input_dim: dimensionality of the embeddings.
        code_dim: sparse code dimensionality of the memory.
        k: number of active units in the WTA code.
        n_memories: number of random associations to pre-fill.
        n_queries: number of timed recall calls.
        hops: associative hops per recall.

    Returns:
        Average latency in milliseconds per query.
    """
    mem = BenchMemory(input_dim, code_dim, k)

    # Pre-fill with random associations so W is representative of real use.
    for _ in range(n_memories):
        c = torch.randn(input_dim, device=DEVICE)
        t = torch.randn(input_dim, device=DEVICE)
        mem.learn(c, t)

    queries = torch.randn(n_queries, input_dim, device=DEVICE)

    # Drain pending GPU work, then time with the monotonic perf_counter
    # clock (time.time() is wall-clock and can be adjusted mid-run).
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for i in range(n_queries):
        mem.recall(queries[i], hops=hops)
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    return dt / n_queries * 1000  # ms per query
|
||||
|
||||
|
||||
def benchmark_memory_usage(input_dim, code_dims):
    """Measure GPU memory at different code_dim."""
    MB = 1024**2
    results = {}
    for dim in code_dims:
        # Start each size from a clean slate so measurements don't bleed over.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        before = torch.cuda.memory_allocated()
        mem = BenchMemory(input_dim, dim, k=50)
        # Populate with 1000 random associations so W reflects real use.
        for _ in range(1000):
            cue = torch.randn(input_dim, device=DEVICE)
            target = torch.randn(input_dim, device=DEVICE)
            mem.learn(cue, target)

        after = torch.cuda.memory_allocated()
        peak = torch.cuda.max_memory_allocated()

        # Analytic sizes assume float32 (4 bytes per element).
        w_size = dim * dim * 4 / MB
        proj_size = input_dim * dim * 4 / MB
        total_allocated = (after - before) / MB

        results[dim] = {
            "W_size_MB": w_size,
            "proj_size_MB": proj_size,
            "total_allocated_MB": total_allocated,
            "peak_MB": peak / MB,
        }
        print(f" code_dim={dim:>6}: W={w_size:.0f}MB, proj={proj_size:.0f}MB, "
              f"total={total_allocated:.0f}MB")

        del mem
    return results
|
||||
|
||||
|
||||
def main():
    """Run all five benchmark suites and print results to stdout.

    Requires a CUDA device (DEVICE is hard-coded to "cuda") and, for the
    end-to-end test, the sentence-transformers package.
    """
    print("=" * 60)
    print("Experiment 5: Performance Benchmarks")
    print("=" * 60)

    input_dim = 384  # MiniLM dimension

    # Test 1: Learning throughput
    print("\n=== Learning Throughput ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n in [1000, 5000, 10000]:
            rate, dt = benchmark_learn(input_dim, code_dim, k, n)
            print(f" code={code_dim}, k={k}, N={n:>5}: "
                  f"{rate:>8.0f} memories/s ({dt:.2f}s)")
        torch.cuda.empty_cache()

    # Test 2: Recall latency
    print("\n=== Recall Latency ===")
    for code_dim, k in [(8192, 50), (16384, 50), (32768, 50)]:
        for n_mem in [100, 1000, 10000]:
            ms = benchmark_recall(input_dim, code_dim, k, n_mem, n_queries=1000)
            print(f" code={code_dim}, k={k}, N={n_mem:>5}: {ms:.3f} ms/query")
        torch.cuda.empty_cache()

    # Test 3: Multi-hop latency
    print("\n=== Multi-hop Latency ===")
    for hops in [1, 2, 3, 5, 10]:
        ms = benchmark_recall(input_dim, 16384, 50, 1000, n_queries=1000, hops=hops)
        print(f" hops={hops:>2}: {ms:.3f} ms/query")

    # Test 4: GPU Memory
    print("\n=== GPU Memory Usage ===")
    benchmark_memory_usage(input_dim, [4096, 8192, 16384, 32768, 65536])

    # Test 5: End-to-end with sentence-transformers (imported lazily so the
    # other benchmarks run even without the package installed).
    print("\n=== End-to-End Pipeline Latency ===")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

    mem = BenchMemory(384, 16384, 50)
    # Pre-fill 1000 memories: each sentence is associated with the next one.
    sentences = [f"This is test sentence number {i}" for i in range(1000)]
    embs = model.encode(sentences, convert_to_tensor=True,
                        normalize_embeddings=True, device=DEVICE)
    for i in range(1000):
        mem.learn(embs[i], embs[min(i+1, 999)])

    # Benchmark single query pipeline
    query = "What is the test sentence?"
    n_runs = 100

    # Timed with perf_counter (monotonic, high-resolution); time.time() is
    # wall-clock and can be adjusted mid-run, corrupting the measurement.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(n_runs):
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        recalled = mem.recall(q_emb, hops=1)
    torch.cuda.synchronize()
    dt = (time.perf_counter() - t0) / n_runs * 1000

    # Breakdown: time embedding and recall separately, syncing between
    # stages so GPU work is attributed to the right phase.
    t_embed = 0
    t_recall = 0
    for _ in range(n_runs):
        torch.cuda.synchronize()
        t1 = time.perf_counter()
        q_emb = model.encode([query], convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)[0]
        torch.cuda.synchronize()
        t2 = time.perf_counter()
        recalled = mem.recall(q_emb, hops=1)
        torch.cuda.synchronize()
        t3 = time.perf_counter()
        t_embed += t2 - t1
        t_recall += t3 - t2

    t_embed = t_embed / n_runs * 1000
    t_recall = t_recall / n_runs * 1000

    print(f" Total: {dt:.1f} ms/query")
    print(f" Embedding: {t_embed:.1f} ms")
    print(f" Recall: {t_recall:.3f} ms")
    print(f" Ratio: embedding is {t_embed/t_recall:.0f}x slower than recall")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user