NuoNuo: Hippocampal memory module prototype
Hopfield + Hebbian hybrid memory system for LLMs. Two nights of experiments (16 iterations), validated on LongMemEval (ICLR 2025). Architecture: - Single-hop: Two-Stage Hopfield (NN top-20 → softmax settle) - Multi-hop: Hebbian W matrix with WTA pattern separation - 64% on LongMemEval (500 questions), retrieval-only, no LLM dependency - 4ms latency @ 20K memories, ~1GB VRAM Key findings: - Hopfield attention solved noise tolerance (20% → 100% vs flat Hebbian) - WTA pattern separation enables 20K+ capacity - Multi-hop associative chains (6 hops, CosSim=1.0) — RAG can't do this - MiniLM-L6 is optimal (discrimination gap > absolute similarity) - Paraphrase cue augmentation: 55% → 100% on synthetic, 36% → 64% on benchmark - SNN encoder viable (CosSim 0.99) but not needed for current architecture
This commit is contained in:
335
experiments/exp07_attractor.py
Normal file
335
experiments/exp07_attractor.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Experiment 7: Attractor dynamics for noise-tolerant recall.
|
||||
|
||||
Current architecture: heteroassociative, one-shot (W @ cue → target)
|
||||
Problem: noisy cue → noisy recall, no error correction
|
||||
|
||||
Fix: Use attractor dynamics (like real CA3 recurrent network).
|
||||
|
||||
Approach 1: Autoassociative + heteroassociative
|
||||
- Store patterns as attractors: W_auto += outer(pattern, pattern)
|
||||
- Noisy cue → iterate W_auto until convergence → clean cue
|
||||
- Then: W_hetero @ clean_cue → target
|
||||
|
||||
Approach 2: Recurrent settling with inhibition
|
||||
- W stores associations
|
||||
- Recall: iterate (W @ code → WTA → W @ code → ...) with lateral inhibition
|
||||
- Network settles into clean attractor state
|
||||
|
||||
Approach 3: Modern Hopfield (softmax energy)
|
||||
- Replace linear W @ x with softmax-based attention over stored patterns
|
||||
- Exponential storage capacity, natural noise tolerance
|
||||
|
||||
Approach 4: Hebbian + recurrent cleanup with learned inhibition
|
||||
- W for associations + lateral inhibition matrix for competition
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
# All tensors live on one device. Prefer the GPU, but fall back to CPU so the
# experiment still runs (slowly) on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
|
||||
def cosine(a, b):
    """Return the cosine similarity of two 1-D tensors as a Python float.

    Degenerate inputs (either vector all-zero) yield 0.0 rather than NaN.
    """
    if a.norm() == 0 or b.norm() == 0:
        return 0.0
    sim = nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0))
    return sim.item()
|
||||
|
||||
|
||||
def winner_take_all(x, k):
    """Binarize x along its last dim: 1.0 at the k largest entries, 0 elsewhere."""
    winners = x.topk(k, dim=-1).indices
    code = torch.zeros_like(x)
    code.scatter_(-1, winners, 1.0)
    return code
|
||||
|
||||
|
||||
# ===== Approach 1: Autoassociative cleanup + heteroassociative recall =====
|
||||
|
||||
class AttractorMemory:
    """Two-stage recall: first clean the cue, then associate.

    W_auto: autoassociative (cue -> cue), stores cue patterns as attractors.
    W_hetero: heteroassociative (cue -> target), stores associations.

    Recall: noisy_cue -> settle in W_auto -> clean_cue -> W_hetero -> target
    """

    def __init__(self, input_dim, code_dim=16384, k=50):
        """Build an empty memory.

        Args:
            input_dim: dimensionality of the incoming embeddings.
            code_dim: size of the sparse code space.
            k: number of active units per code (sparsity).
        """
        self.k = k
        self.code_dim = code_dim
        # Fixed random projection into the code space; scaled by
        # 1/sqrt(input_dim) so pre-activation magnitudes stay O(1).
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        # Autoassociative: cue cleanup network
        self.W_auto = torch.zeros(code_dim, code_dim, device=DEVICE)
        # Heteroassociative: cue -> target
        self.W_hetero = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Pattern-separate an embedding into a k-sparse binary code."""
        return winner_take_all(x @ self.proj, self.k)

    def learn(self, cue_emb, target_emb):
        """One-shot Hebbian storage of a cue -> target association."""
        cc = self.sep(cue_emb)
        tc = self.sep(target_emb)
        # Auto: store cue as attractor
        self.W_auto += torch.outer(cc, cc)
        # Hetero: cue -> target
        self.W_hetero += torch.outer(tc, cc)

    def settle(self, code, W, steps=10):
        """Iterate W @ code -> WTA until convergence (attractor dynamics).

        Stops early once the code is a fixed point; otherwise runs `steps`
        iterations and returns the last state.
        """
        for _ in range(steps):
            raw = W @ code
            new_code = winner_take_all(raw, self.k)
            if (new_code == code).all():
                break  # Converged
            code = new_code
        return code

    def recall(self, query_emb, settle_steps=10):
        """Noisy query -> auto-settle (cleanup) -> hetero-associate."""
        # Encode
        code = self.sep(query_emb)
        # Phase 1: Settle in autoassociative network (cleanup)
        clean_code = self.settle(code, self.W_auto, steps=settle_steps)
        # Phase 2: Associate
        raw = self.W_hetero @ clean_code
        return winner_take_all(raw, self.k)

    def recall_no_settle(self, query_emb):
        """Direct recall without settling (baseline)."""
        code = self.sep(query_emb)
        raw = self.W_hetero @ code
        return winner_take_all(raw, self.k)
|
||||
|
||||
|
||||
# ===== Approach 2: Modern Hopfield-inspired attention =====
|
||||
|
||||
class HopfieldMemory:
    """Modern Hopfield network: attention over stored patterns.

    Instead of W @ query (linear), recall uses
        softmax(beta * query @ stored_patterns^T) @ stored_targets
    which gives exponential capacity and natural noise tolerance.
    Still uses WTA codes for compatibility with Hebbian multi-hop.
    """

    def __init__(self, input_dim, code_dim=16384, k=50, beta=8.0):
        self.k = k
        self.code_dim = code_dim
        self.beta = beta  # inverse temperature of the retrieval softmax
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        self.stored_cue_codes = []
        self.stored_target_codes = []

    def sep(self, x):
        """Encode an embedding as a k-winner binary code."""
        return winner_take_all(x @ self.proj, self.k)

    def learn(self, cue_emb, target_emb):
        """Append the encoded (cue, target) pair to the pattern store."""
        self.stored_cue_codes.append(self.sep(cue_emb))
        self.stored_target_codes.append(self.sep(target_emb))

    def recall(self, query_emb, steps=3):
        """Hopfield retrieval: iterative attention over stored patterns."""
        if not self.stored_cue_codes:
            return torch.zeros(self.code_dim, device=DEVICE)

        cues = torch.stack(self.stored_cue_codes)        # [N, code_dim]
        targets = torch.stack(self.stored_target_codes)  # [N, code_dim]

        state = self.sep(query_emb)  # [code_dim]

        # Settle: repeatedly attend over stored cues and re-sparsify,
        # pulling the state toward the nearest stored cue pattern.
        for _ in range(steps):
            attn = torch.softmax(self.beta * (state @ cues.T), dim=0)  # [N]
            state = winner_take_all(attn @ cues, self.k)

        # Final hop: attend once more, but read out the associated targets.
        attn = torch.softmax(self.beta * (state @ cues.T), dim=0)
        recalled = attn @ targets
        return winner_take_all(recalled, self.k)
|
||||
|
||||
|
||||
# ===== Approach 3: Recurrent Hebbian with lateral inhibition =====
|
||||
|
||||
class RecurrentHebbianMemory:
    """Hebbian W + lateral inhibition for competitive recall.

    During settling, neurons compete: strongly activated patterns
    suppress weakly activated ones via inhibition.
    """

    def __init__(self, input_dim, code_dim=16384, k=50, inhibition=0.1):
        self.k = k
        self.code_dim = code_dim
        self.inhibition = inhibition  # strength of the global-inhibition term
        self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                     * (1.0 / input_dim**0.5))
        self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

    def sep(self, x):
        """Encode an embedding as a k-winner binary code."""
        return winner_take_all(x @ self.proj, self.k)

    def learn(self, cue_emb, target_emb):
        """Store cue -> target plus a half-strength cue auto-attractor."""
        cue_code = self.sep(cue_emb)
        target_code = self.sep(target_emb)
        self.W += torch.outer(target_code, cue_code)
        # Also store cue as auto-attractor (for settling)
        self.W += torch.outer(cue_code, cue_code) * 0.5

    def recall(self, query_emb, steps=5):
        """Settle for a fixed number of steps under excitation - inhibition."""
        state = self.sep(query_emb)
        for _ in range(steps):
            excitation = self.W @ state
            # Global inhibition: subtract a fraction of the mean activity
            drive = excitation - excitation.mean() * self.inhibition
            # WTA: winner suppresses losers
            state = winner_take_all(drive, self.k)
        return state
|
||||
|
||||
|
||||
# ===== Test harness =====
|
||||
|
||||
def build_and_test(MemClass, model, n_test_pairs=10, n_background=0,
                   label="", **kwargs):
    """Unified test for all memory architectures.

    Stores up to 10 fixed (cue, target) sentence pairs plus optional
    synthetic background pairs, then scores recall twice per pair: once
    with the exact cue and once with a paraphrased cue. A recall counts
    as correct when the recalled code is closest (cosine) to the true
    target's code among all test targets.

    Args:
        MemClass: memory class exposing learn(cue_emb, target_emb),
            recall(query_emb) and sep(emb).
        model: sentence encoder with encode(...) and
            get_sentence_embedding_dimension() (e.g. a SentenceTransformer).
        n_test_pairs: number of the fixed test pairs to use (capped at 10).
        n_background: number of distractor pairs stored alongside the tests.
        label: tag printed on the result line.
        **kwargs: forwarded to MemClass(embed_dim, **kwargs).

    Returns:
        Tuple (exact_accuracy, paraphrase_accuracy), each in [0, 1].
    """
    # NOTE: the previous version imported SentenceTransformer here but never
    # used it — the encoder is injected via `model`, so the import is gone.
    pairs = [
        ("What's the weather like today?", "User checks weather every morning"),
        ("Let's deploy the new version", "Deployment uses GitHub Actions with k3s"),
        ("The database is slow again", "Missing index on users table"),
        ("I need to fix the auth bug", "JWT tokens with 24h expiry in Redis"),
        ("The API returns 500 errors", "OOM in the Python worker"),
        ("Let's set up monitoring", "Prometheus + Grafana on OCI cluster"),
        ("Tests are failing in CI", "CI needs postgres service container"),
        ("Memory usage is too high", "Leak in websocket handler"),
        ("Help with Docker setup", "docker-compose for dev, k3s for prod"),
        ("Log files are too large", "Logs rotate daily, shipped to Loki"),
    ][:n_test_pairs]

    # Paraphrases[i] is a reworded version of pairs[i][0].
    paraphrases = [
        "How's the weather outside?",
        "We should push the new release",
        "DB performance is terrible",
        "There's a login bug to fix",
        "Getting internal server errors",
        "We need better observability",
        "CI tests keep breaking",
        "Service using too much RAM",
        "Docker configuration help",
        "Logs eating up disk space",
    ][:n_test_pairs]

    embed_dim = model.get_sentence_embedding_dimension()
    mem = MemClass(embed_dim, **kwargs)

    # Store test memories
    cue_embs = model.encode([p[0] for p in pairs], convert_to_tensor=True,
                            normalize_embeddings=True, device=DEVICE)
    target_embs = model.encode([p[1] for p in pairs], convert_to_tensor=True,
                               normalize_embeddings=True, device=DEVICE)
    for i in range(len(pairs)):
        mem.learn(cue_embs[i], target_embs[i])

    # Store background noise (interference load)
    if n_background > 0:
        bg_cues = [f"Background task {i} about topic {i%20}" for i in range(n_background)]
        bg_targets = [f"Background fact {i} detail {i%10}" for i in range(n_background)]
        bg_cue_embs = model.encode(bg_cues, convert_to_tensor=True,
                                   normalize_embeddings=True, device=DEVICE, batch_size=256)
        bg_target_embs = model.encode(bg_targets, convert_to_tensor=True,
                                      normalize_embeddings=True, device=DEVICE, batch_size=256)
        for i in range(n_background):
            mem.learn(bg_cue_embs[i], bg_target_embs[i])

    # Test: score recalls against the encoded target codes.
    target_codes = torch.stack([mem.sep(t) for t in target_embs])
    para_embs = model.encode(paraphrases, convert_to_tensor=True,
                             normalize_embeddings=True, device=DEVICE)

    exact_correct = 0
    para_correct = 0

    for i in range(len(pairs)):
        # Exact cue
        recalled = mem.recall(cue_embs[i])
        sims = nn.functional.cosine_similarity(recalled.unsqueeze(0), target_codes, dim=-1)
        if sims.argmax().item() == i:
            exact_correct += 1

        # Paraphrased cue
        recalled_p = mem.recall(para_embs[i])
        sims_p = nn.functional.cosine_similarity(recalled_p.unsqueeze(0), target_codes, dim=-1)
        if sims_p.argmax().item() == i:
            para_correct += 1

    n = len(pairs)
    print(f"  {label} (bg={n_background}): "
          f"Exact={exact_correct}/{n} ({exact_correct/n:.0%}), "
          f"Para={para_correct}/{n} ({para_correct/n:.0%})")
    return exact_correct / n, para_correct / n
|
||||
|
||||
|
||||
def main():
    """Compare all attractor architectures at increasing background load.

    Runs the shared test harness for the flat-Hebbian baseline and each of
    the three approaches, sweeping beta (Hopfield) and inhibition strength
    (recurrent) at 0/100/500/1000 background memories.
    """
    print("=" * 60)
    print("Experiment 7: Attractor Dynamics")
    print("=" * 60)

    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

    # Baseline: flat Hebbian (no settling). Hoisted out of the loop — the
    # previous version redefined this class on every background-size pass.
    class FlatHebbian:
        """One-shot heteroassociative memory: W @ cue -> target, no cleanup."""
        def __init__(self, input_dim, code_dim=16384, k=50):
            self.k = k
            self.code_dim = code_dim
            self.proj = (torch.randn(input_dim, code_dim, device=DEVICE)
                         * (1.0 / input_dim**0.5))
            self.W = torch.zeros(code_dim, code_dim, device=DEVICE)

        def sep(self, x):
            return winner_take_all(x @ self.proj, self.k)

        def learn(self, c, t):
            self.W += torch.outer(self.sep(t), self.sep(c))

        def recall(self, q):
            code = self.sep(q)
            return winner_take_all(self.W @ code, self.k)

    # Test each architecture at different interference scales.
    # (A dead, unused `configs` list was removed here.)
    for bg in [0, 100, 500, 1000]:
        print(f"\n=== Background memories: {bg} ===")

        build_and_test(FlatHebbian, model, n_background=bg,
                       label="Flat Hebbian", code_dim=16384, k=50)

        # Approach 1: Autoassociative cleanup
        build_and_test(AttractorMemory, model, n_background=bg,
                       label="Attractor (auto+hetero)", code_dim=16384, k=50)

        # Approach 2: Modern Hopfield, sweeping the inverse temperature
        for beta in [4.0, 8.0, 16.0]:
            build_and_test(HopfieldMemory, model, n_background=bg,
                           label=f"Hopfield (β={beta})", code_dim=16384, k=50,
                           beta=beta)

        # Approach 3: Recurrent with inhibition, sweeping its strength
        for inhib in [0.1, 0.5, 1.0]:
            build_and_test(RecurrentHebbianMemory, model, n_background=bg,
                           label=f"Recurrent (inhib={inhib})", code_dim=16384, k=50,
                           inhibition=inhib)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user