add nocmem: auto memory recall + ingest via NuoNuo hippocampal network

- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt);
  see the sketch after this list
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
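A minimal sketch of the recall path described above, for orientation: a cosine pre-filter first, then Hopfield-style softmax attention over the survivors. The names here (recall_sketch, cues) are illustrative, not nocmem's actual API; only the constants (threshold 0.35, β=32, top_k=3) come from this commit.

import torch

def recall_sketch(query, cues, beta=32.0, threshold=0.35, top_k=3):
    """query: (d,) unit vector; cues: (N, d) unit memory cue vectors."""
    cos = cues @ query                            # cosine similarity to every memory
    idx = (cos >= threshold).nonzero(as_tuple=True)[0]  # cosine pre-filter
    if idx.numel() == 0:
        return []                                 # nothing recalled clearly
    attn = torch.softmax(beta * cos[idx], dim=0)  # Hopfield attention over survivors
    order = attn.argsort(descending=True)[:top_k]
    return [(idx[i].item(), attn[i].item()) for i in order]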
Fam Zheng
2026-04-11 12:24:48 +01:00
parent 688387dac3
commit 7000ccda0f
17 changed files with 4164 additions and 3 deletions

View File

@@ -0,0 +1,345 @@
"""Efficiency benchmark for nocmem vs ChromaDB baseline.
Measures: storage size, memory usage, query latency, ingest throughput
at various scales (100, 1K, 5K, 10K, 20K memories).
Usage:
uv run python benchmarks/efficiency_bench.py
"""
import gc
import os
import json
import shutil
import tempfile
import time
import torch
import psutil
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "all-MiniLM-L6-v2"
EMBED_DIM = 384
DATA_FILE = "benchmarks/longmemeval.json"
# ── helpers ─────────────────────────────────────────────────────────
def get_process_mem_mb():
return psutil.Process(os.getpid()).memory_info().rss / 1024**2
def get_gpu_mem_mb():
if DEVICE != "cuda":
return 0.0
return torch.cuda.memory_allocated() / 1024**2
def file_size_mb(path):
if os.path.exists(path):
return os.path.getsize(path) / 1024**2
return 0.0
def dir_size_mb(path):
total = 0
for dirpath, _, filenames in os.walk(path):
for f in filenames:
total += os.path.getsize(os.path.join(dirpath, f))
return total / 1024**2
# ── extract chunks from LongMemEval ────────────────────────────────
def load_chunks(max_chunks=25000):
"""Extract turn-level chunks from LongMemEval data."""
with open(DATA_FILE) as f:
data = json.load(f)
chunks = []
seen = set()
for item in data:
for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
for i in range(0, len(sess) - 1, 2):
key = (sid, i)
if key in seen:
continue
seen.add(key)
user = sess[i]["content"]
asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
text = f"{user}\n{asst}"[:1000]
chunks.append(text)
if len(chunks) >= max_chunks:
return chunks
return chunks
# ── nocmem benchmark ────────────────────────────────────────────────
def bench_nocmem(encoder, chunks, n, query_texts):
"""Benchmark nocmem at scale n."""
torch.cuda.empty_cache()
gc.collect()
subset = chunks[:n]
gpu_before = get_gpu_mem_mb()
ram_before = get_process_mem_mb()
# batch embed
t0 = time.monotonic()
embeddings = encoder.encode(
subset, convert_to_tensor=True, normalize_embeddings=True,
device=DEVICE, batch_size=256, show_progress_bar=False,
)
embed_time = time.monotonic() - t0
# store
hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
t1 = time.monotonic()
for i in range(n):
hip.store(embeddings[i], embeddings[i], metadata={"id": i})
store_time = time.monotonic() - t1
gpu_after = get_gpu_mem_mb()
ram_after = get_process_mem_mb()
    # save to measure file size (mkstemp avoids the deprecated, race-prone mktemp)
    fd, tmp = tempfile.mkstemp(suffix=".pt")
    os.close(fd)
    hip.save(tmp)
    disk_mb = file_size_mb(tmp)
    os.unlink(tmp)
# query latency — multiple queries, measure p50/p99
query_embs = encoder.encode(
query_texts, convert_to_tensor=True, normalize_embeddings=True,
device=DEVICE, show_progress_bar=False,
)
latencies = []
for qe in query_embs:
t = time.monotonic()
hip.recall(qe, top_k=5)
latencies.append((time.monotonic() - t) * 1000)
latencies.sort()
p50 = latencies[len(latencies) // 2]
p99 = latencies[int(len(latencies) * 0.99)]
avg = sum(latencies) / len(latencies)
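    # note: with only ~20 queries, int(len(latencies) * 0.99) indexes the last
    # (slowest) sample, so "p99" here is effectively the max; it only becomes a
    # meaningful percentile estimate with a larger query set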
# cleanup
del hip, embeddings
torch.cuda.empty_cache()
return {
"n": n,
"embed_time_s": embed_time,
"store_time_s": store_time,
"ingest_rate": n / (embed_time + store_time), # memories/sec
"disk_mb": disk_mb,
"gpu_delta_mb": gpu_after - gpu_before,
"ram_delta_mb": ram_after - ram_before,
"latency_avg_ms": avg,
"latency_p50_ms": p50,
"latency_p99_ms": p99,
}
# ── chromadb benchmark ──────────────────────────────────────────────
def bench_chromadb(encoder, chunks, n, query_texts):
"""Benchmark ChromaDB (MemPalace's backend) at scale n."""
import chromadb
subset = chunks[:n]
ram_before = get_process_mem_mb()
tmpdir = tempfile.mkdtemp()
client = chromadb.PersistentClient(path=tmpdir)
collection = client.create_collection(
name="bench",
metadata={"hnsw:space": "cosine"},
)
# embed
t0 = time.monotonic()
embeddings_np = encoder.encode(
subset, normalize_embeddings=True,
batch_size=256, show_progress_bar=False,
)
embed_time = time.monotonic() - t0
# store — chromadb takes numpy/list
t1 = time.monotonic()
batch = 5000
for start in range(0, n, batch):
end = min(start + batch, n)
collection.add(
ids=[str(i) for i in range(start, end)],
embeddings=embeddings_np[start:end].tolist(),
documents=subset[start:end],
)
store_time = time.monotonic() - t1
ram_after = get_process_mem_mb()
disk_mb = dir_size_mb(tmpdir)
# query latency
query_np = encoder.encode(
query_texts, normalize_embeddings=True, show_progress_bar=False,
)
latencies = []
for qe in query_np:
t = time.monotonic()
collection.query(query_embeddings=[qe.tolist()], n_results=5)
latencies.append((time.monotonic() - t) * 1000)
latencies.sort()
p50 = latencies[len(latencies) // 2]
p99 = latencies[int(len(latencies) * 0.99)]
avg = sum(latencies) / len(latencies)
# cleanup
del client, collection
shutil.rmtree(tmpdir)
return {
"n": n,
"embed_time_s": embed_time,
"store_time_s": store_time,
"ingest_rate": n / (embed_time + store_time),
"disk_mb": disk_mb,
"gpu_delta_mb": 0,
"ram_delta_mb": ram_after - ram_before,
"latency_avg_ms": avg,
"latency_p50_ms": p50,
"latency_p99_ms": p99,
}
# ── main ────────────────────────────────────────────────────────────
def main():
print("nocmem efficiency benchmark")
print(f"device: {DEVICE}")
print()
# check chromadb available
has_chromadb = False
try:
import chromadb
has_chromadb = True
print("chromadb: available (will compare)")
except ImportError:
print("chromadb: not installed (nocmem only)")
print()
print("loading data...")
chunks = load_chunks(25000)
print(f" {len(chunks)} unique chunks extracted")
print("loading encoder...")
encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
# query texts — mix of English and Chinese
query_texts = [
"What degree did I graduate with?",
"How to deploy the application?",
"What was the database error we fixed last week?",
"Tell me about the meeting schedule",
"What programming language should I learn?",
"数据库密码在哪里",
"部署到生产环境的步骤",
"上次讨论的性能优化方案",
"项目的技术栈是什么",
"最近的待办事项有哪些",
"How do I configure the server?",
"What's the API endpoint for user authentication?",
"Can you recommend some books on machine learning?",
"What was the root cause of the production incident?",
"How much memory does the GPU have?",
"VR设备的兼容性问题",
"模型推理的延迟是多少",
"代码仓库的结构是怎样的",
"如何解决内存泄漏",
"上次会议的结论是什么",
]
scales = [100, 500, 1000, 5000, 10000, 20000]
# filter to what we have
scales = [s for s in scales if s <= len(chunks)]
nocmem_results = []
chroma_results = []
for n in scales:
print(f"\n── scale: {n:,} memories ──")
print(f" nocmem...", end="", flush=True)
r = bench_nocmem(encoder, chunks, n, query_texts)
nocmem_results.append(r)
print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)")
if has_chromadb:
print(f" chromadb...", end="", flush=True)
r2 = bench_chromadb(encoder, chunks, n, query_texts)
chroma_results.append(r2)
print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)")
# ── report ──────────────────────────────────────────────────────
print(f"\n{'='*80}")
print(f"EFFICIENCY BENCHMARK RESULTS")
print(f"{'='*80}")
# table header
if has_chromadb:
print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}")
print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}")
print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
print("-" * 100)
for nm, cr in zip(nocmem_results, chroma_results):
print(
f"{nm['n']:>8,} | "
f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | "
f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}"
)
else:
print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}")
print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
print("-" * 60)
for nm in nocmem_results:
print(
f"{nm['n']:>8,} | "
f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}"
)
# summary
if nocmem_results:
biggest = nocmem_results[-1]
print(f"\nnocmem @ {biggest['n']:,}:")
print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
print(f" Disk: {biggest['disk_mb']:.1f} MB")
print(f" VRAM delta: {biggest['gpu_delta_mb']:.1f} MB")
print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")
if chroma_results:
biggest = chroma_results[-1]
print(f"\nChromaDB @ {biggest['n']:,}:")
print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
print(f" Disk: {biggest['disk_mb']:.1f} MB")
print(f" RAM delta: {biggest['ram_delta_mb']:.1f} MB")
print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")
if has_chromadb and nocmem_results and chroma_results:
nm = nocmem_results[-1]
cr = chroma_results[-1]
print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──")
lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf')
disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf')
rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf')
print(f" Latency: nocmem {lat_ratio:.1f}x faster" if lat_ratio > 1 else f" Latency: ChromaDB {1/lat_ratio:.1f}x faster")
print(f" Disk: nocmem {disk_ratio:.1f}x smaller" if disk_ratio > 1 else f" Disk: ChromaDB {1/disk_ratio:.1f}x smaller")
print(f" Ingest: nocmem {rate_ratio:.1f}x faster" if rate_ratio > 1 else f" Ingest: ChromaDB {1/rate_ratio:.1f}x faster")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,239 @@
"""LongMemEval benchmark for nocmem.
Evaluates retrieval quality: given a question, can nocmem find the correct
session(s) from a haystack of ~50 conversation sessions?
Uses HippocampalMemory directly (no HTTP) for speed.
Compares against MemPalace's 96.6% R@5 baseline.
Usage:
uv run python benchmarks/longmemeval_bench.py [--limit N] [--granularity session|turn]
"""
import argparse
import json
import math
import sys
import time
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
# ── setup ───────────────────────────────────────────────────────────
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "all-MiniLM-L6-v2"
EMBED_DIM = 384
def load_encoder():
print(f"loading {EMBED_MODEL} on {DEVICE}...")
return SentenceTransformer(EMBED_MODEL, device=DEVICE)
def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
"""Batch embed, returns (N, dim) tensor."""
return encoder.encode(
texts, convert_to_tensor=True, normalize_embeddings=True,
device=DEVICE, batch_size=128, show_progress_bar=False,
)
# ── granularity: how to chunk sessions ──────────────────────────────
def sessions_to_chunks_turn(session_ids, sessions):
"""Each user-assistant turn becomes a separate chunk."""
chunks = [] # (text, session_id)
for sid, sess in zip(session_ids, sessions):
for i in range(0, len(sess) - 1, 2):
user = sess[i]["content"]
asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
text = f"{user}\n{asst}"
# truncate long turns to avoid embedding issues
chunks.append((text[:1000], sid))
# handle odd-numbered turns
if len(sess) % 2 == 1:
chunks.append((sess[-1]["content"][:1000], sid))
return chunks
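# e.g. a 5-message session [u0, a0, u1, a1, u2] becomes the chunks
# "u0\na0" and "u1\na1", plus the trailing "u2" via the odd-length branch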
def sessions_to_chunks_session(session_ids, sessions):
"""Each session becomes a single chunk (concatenated turns)."""
chunks = []
for sid, sess in zip(session_ids, sessions):
text = "\n".join(m["content"] for m in sess)
# truncate to fit embedding model's context
chunks.append((text[:2000], sid))
return chunks
# ── evaluate one question ───────────────────────────────────────────
def evaluate_question(encoder, item, granularity, ks=(5, 10)):
"""Store haystack, query, check if answer session in top-K.
Returns dict with R@5, R@10, NDCG@10, timings.
"""
# chunk the haystack
if granularity == "turn":
chunks = sessions_to_chunks_turn(
item["haystack_session_ids"], item["haystack_sessions"])
else:
chunks = sessions_to_chunks_session(
item["haystack_session_ids"], item["haystack_sessions"])
texts = [c[0] for c in chunks]
sids = [c[1] for c in chunks]
answer_sids = set(item["answer_session_ids"])
# batch embed all chunks
t0 = time.monotonic()
embeddings = embed_batch(encoder, texts)
embed_time = time.monotonic() - t0
# build memory
t1 = time.monotonic()
hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
for i in range(len(chunks)):
hip.store(
embeddings[i], embeddings[i],
metadata={"session_id": sids[i]},
)
store_time = time.monotonic() - t1
# query
t2 = time.monotonic()
query_emb = encoder.encode(
[item["question"]], convert_to_tensor=True,
normalize_embeddings=True, device=DEVICE,
)[0]
max_k = max(ks)
results = hip.recall(query_emb, top_k=max_k)
recall_time = time.monotonic() - t2
# deduplicate by session_id, preserving rank order
seen = set()
ranked_sids = []
for r in results:
sid = r.metadata["session_id"]
if sid not in seen:
seen.add(sid)
ranked_sids.append(sid)
# compute metrics
metrics = {}
for k in ks:
top_k_sids = set(ranked_sids[:k])
hit = bool(answer_sids & top_k_sids)
metrics[f"R@{k}"] = 1.0 if hit else 0.0
# NDCG@10
ndcg = compute_ndcg(ranked_sids[:10], answer_sids)
metrics["NDCG@10"] = ndcg
metrics["embed_ms"] = embed_time * 1000
metrics["store_ms"] = store_time * 1000
metrics["recall_ms"] = recall_time * 1000
metrics["n_chunks"] = len(chunks)
return metrics
def compute_ndcg(ranked_sids, answer_sids, k=10):
"""Normalized Discounted Cumulative Gain."""
dcg = 0.0
for i, sid in enumerate(ranked_sids[:k]):
if sid in answer_sids:
dcg += 1.0 / math.log2(i + 2) # i+2 because rank starts at 1
# ideal: all answer sessions at top
n_relevant = min(len(answer_sids), k)
idcg = sum(1.0 / math.log2(i + 2) for i in range(n_relevant))
return dcg / idcg if idcg > 0 else 0.0
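# worked example: ranked_sids = ["s1", "s3", "s2"], answer_sids = {"s3"}:
# the hit is at i=1, so DCG = 1/log2(3) ≈ 0.631; IDCG = 1/log2(2) = 1.0,
# giving NDCG ≈ 0.631 (a hit at rank 1 would score 1.0)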
# ── main ────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--data", default="benchmarks/longmemeval.json")
parser.add_argument("--limit", type=int, default=0, help="limit number of questions (0=all)")
parser.add_argument("--granularity", choices=["session", "turn"], default="turn")
args = parser.parse_args()
print(f"LongMemEval benchmark for nocmem")
print(f"granularity: {args.granularity}")
print(f"device: {DEVICE}")
print()
with open(args.data) as f:
data = json.load(f)
if args.limit:
data = data[:args.limit]
encoder = load_encoder()
print(f"evaluating {len(data)} questions...\n")
all_metrics = []
by_type = {}
for i, item in enumerate(data):
metrics = evaluate_question(encoder, item, args.granularity)
all_metrics.append(metrics)
qtype = item["question_type"]
if qtype not in by_type:
by_type[qtype] = []
by_type[qtype].append(metrics)
# progress
if (i + 1) % 10 == 0 or i == len(data) - 1:
r5 = sum(m["R@5"] for m in all_metrics) / len(all_metrics) * 100
r10 = sum(m["R@10"] for m in all_metrics) / len(all_metrics) * 100
avg_recall = sum(m["recall_ms"] for m in all_metrics) / len(all_metrics)
print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")
# final results
n = len(all_metrics)
r5 = sum(m["R@5"] for m in all_metrics) / n * 100
r10 = sum(m["R@10"] for m in all_metrics) / n * 100
ndcg = sum(m["NDCG@10"] for m in all_metrics) / n * 100
avg_embed = sum(m["embed_ms"] for m in all_metrics) / n
avg_store = sum(m["store_ms"] for m in all_metrics) / n
avg_recall = sum(m["recall_ms"] for m in all_metrics) / n
avg_chunks = sum(m["n_chunks"] for m in all_metrics) / n
print(f"\n{'='*60}")
print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
print(f"{'='*60}")
print(f" Questions: {n}")
print(f" Avg chunks: {avg_chunks:.0f}")
print(f"")
print(f" R@5: {r5:.1f}%")
print(f" R@10: {r10:.1f}%")
print(f" NDCG@10: {ndcg:.1f}%")
print(f"")
print(f" Avg embed: {avg_embed:.0f}ms")
print(f" Avg store: {avg_store:.0f}ms")
print(f" Avg recall: {avg_recall:.1f}ms")
print(f"\n── by question type ──")
for qtype, ms in sorted(by_type.items()):
nt = len(ms)
tr5 = sum(m["R@5"] for m in ms) / nt * 100
tr10 = sum(m["R@10"] for m in ms) / nt * 100
print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")
print(f"\n── comparison ──")
print(f" MemPalace (raw, session): R@5=96.6%")
print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,178 @@
"""Does recall noise decrease as memory count grows?
At various scales, measure:
1. Recall accuracy (R@3) for relevant queries
2. Max cosine similarity for irrelevant queries
3. Separation gap between relevant and irrelevant
If nocmem works well at scale, the gap should widen — relevant queries
should score much higher than irrelevant ones as the memory pool grows.
"""
import json
import time
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda"
EMBED_DIM = 384
DATA_FILE = "benchmarks/longmemeval.json"
IRRELEVANT_QUERIES = [
"今天天气怎么样",
"你喜欢吃什么",
"",
"讲个笑话",
"明天会下雨吗",
"你觉得猫可爱还是狗可爱",
"人生的意义是什么",
"帮我写一首诗",
"地球到月球有多远",
"如何学会游泳",
]
BETA_CONFIGS = [16.0, 32.0, 64.0]
SCALES = [50, 200, 500, 1000, 3000]
def main():
print("noise vs scale benchmark\n")
print("loading encoder...")
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
def emb(text):
return encoder.encode([text], convert_to_tensor=True,
normalize_embeddings=True, device=DEVICE)[0]
def emb_batch(texts):
return encoder.encode(texts, convert_to_tensor=True,
normalize_embeddings=True, device=DEVICE,
batch_size=256, show_progress_bar=False)
# load data
print("loading data...")
with open(DATA_FILE) as f:
data = json.load(f)
# collect unique chunks with their source question index
all_chunks = [] # (text, question_idx, session_id)
seen = set()
for qi, item in enumerate(data):
for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
for i in range(0, len(sess) - 1, 2):
key = (sid, i)
if key in seen:
continue
seen.add(key)
user = sess[i]["content"]
asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
text = f"{user}\n{asst}"[:1000]
all_chunks.append((text, qi, sid))
print(f" {len(all_chunks)} unique chunks")
# pre-embed irrelevant queries
irrel_embs = [emb(q) for q in IRRELEVANT_QUERIES]
    # collect relevant queries: for each question, we know its answer sessions
    # (take the first 100 questions; whether their answer sessions are present
    # in a given subset is checked per-scale below)
relevant_queries = []
for item in data[:100]:
answer_sids = set(item["answer_session_ids"])
relevant_queries.append((item["question"], answer_sids))
rel_query_embs = emb_batch([q for q, _ in relevant_queries])
print(f" {len(relevant_queries)} relevant queries")
print(f" {len(IRRELEVANT_QUERIES)} irrelevant queries")
# filter scales to what we have
scales = [s for s in SCALES if s <= len(all_chunks)]
for beta in BETA_CONFIGS:
print(f"\n{'='*70}")
print(f" β = {beta}")
print(f"{'='*70}")
print(f"{'Scale':>7} | {'R@3':>6} | {'Rel maxcos':>10} {'Irrel maxcos':>12} {'Gap':>8} | {'Rel attn':>9} {'Irrel attn':>11}")
print("-" * 80)
for n in scales:
subset = all_chunks[:n]
texts = [c[0] for c in subset]
sids = [c[2] for c in subset]
# embed and build memory
embeddings = emb_batch(texts)
hip = HippocampalMemory(
embed_dim=EMBED_DIM, beta=beta, hopfield_top_k=10, device=DEVICE,
)
for i in range(n):
hip.store(embeddings[i], embeddings[i],
metadata={"session_id": sids[i]})
cue_mat = hip._get_cue_matrix()
# --- relevant queries ---
rel_max_cos = []
rel_top_attn = []
hits = 0
tested = 0
for qi in range(len(relevant_queries)):
question, answer_sids = relevant_queries[qi]
qe = rel_query_embs[qi]
# check if any answer session is in this subset
subset_sids = set(sids)
if not (answer_sids & subset_sids):
continue
tested += 1
# cosine sim
cos_sims = qe @ cue_mat.T
rel_max_cos.append(cos_sims.max().item())
# recall
results = hip.recall(qe, top_k=3)
top_attn = results[0].similarity if results else 0
rel_top_attn.append(top_attn)
recalled_sids = {r.metadata["session_id"] for r in results}
if answer_sids & recalled_sids:
hits += 1
r3 = hits / tested * 100 if tested > 0 else 0
avg_rel_cos = np.mean(rel_max_cos) if rel_max_cos else 0
avg_rel_attn = np.mean(rel_top_attn) if rel_top_attn else 0
# --- irrelevant queries ---
irrel_max_cos = []
irrel_top_attn = []
for qe in irrel_embs:
cos_sims = qe @ cue_mat.T
irrel_max_cos.append(cos_sims.max().item())
results = hip.recall(qe, top_k=3)
top_attn = results[0].similarity if results else 0
irrel_top_attn.append(top_attn)
avg_irrel_cos = np.mean(irrel_max_cos)
avg_irrel_attn = np.mean(irrel_top_attn)
gap = avg_rel_cos - avg_irrel_cos
print(f"{n:>7,} | {r3:>5.1f}% | {avg_rel_cos:>10.3f} {avg_irrel_cos:>12.3f} {gap:>8.3f} | {avg_rel_attn:>8.0%} {avg_irrel_attn:>10.0%}")
del hip
torch.cuda.empty_cache()
print(f"\n── 解读 ──")
print(f"Rel maxcos: 相关查询的最大余弦相似度(越高越好)")
print(f"Irrel maxcos: 无关查询的最大余弦相似度(越低越好)")
print(f"Gap: 两者之差(越大越好 = 越容易区分)")
print(f"Rel attn: 相关查询 top1 的 Hopfield attention 权重")
print(f"Irrel attn: 无关查询 top1 的 Hopfield attention 权重(越低 = 越少噪音)")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,104 @@
"""Test Hopfield attention sharpness with different top_k and beta.
Goal: find settings that give "either clearly remembered or nothing"
instead of flat attention across 20 candidates.
"""
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda"
EMBED_DIM = 384
print("loading encoder...")
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
def emb(text):
return encoder.encode([text], convert_to_tensor=True, normalize_embeddings=True, device=DEVICE)[0]
# store the same memories in each config
MEMORIES = [
("bot的名字叫什么", "bot的名字叫小乖是Fam给取的"),
("有哪些工具可以用", "工具有: fam_todo, send_file, spawn_agent, run_shell, run_python, update_memory"),
("vLLM在5090上的性能", "RTX 5090上vLLM跑gemma只有4.8 tok/s需要切换到awq_marlin"),
("repo-vis项目是什么", "repo-vis用Rust后端+Three.js前端的3D代码库可视化目标支持Linux内核和Pico VR"),
("repo-vis的性能瓶颈", "Linux内核79K文件SQLite 1GB上限和O(n)反序列化是瓶颈需要n-ary tree按需合并"),
("明天的待办事项", "最紧迫的是emblem scanner的AI Chat和KB部分"),
("后端切换到了什么", "NOC后端切换到了vLLM速度变快了"),
("数据库密码在哪里", "数据库密码存在 /etc/secrets/db.env 文件中"),
("什么GPU", "服务器有NVIDIA RTX 4090 24GB VRAM"),
("home有多少log文件", "home目录及子目录下共有960个.log文件"),
]
QUERIES = [
("repo-vis怎么样了", "repo-vis", True), # should recall clearly
("数据库密码", "密码", True), # should recall clearly
("今天天气怎么样", "天气", False), # irrelevant, should recall nothing
("vllm速度", "vllm", True), # should recall clearly
("你喜欢吃什么", "吃什么", False), # irrelevant
("VR支持", "VR", True), # edge case
]
CONFIGS = [
# (top_k, beta, label)
(20, 16.0, "baseline (top_k=20, β=16)"),
(10, 16.0, "top_k=10, β=16"),
(5, 16.0, "top_k=5, β=16"),
(20, 32.0, "top_k=20, β=32"),
(20, 64.0, "top_k=20, β=64"),
(10, 32.0, "top_k=10, β=32"),
(5, 32.0, "top_k=5, β=32"),
(5, 64.0, "top_k=5, β=64"),
]
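# why β sharpens: assuming similarity is a softmax over β-scaled cosine scores
# (which is what sweeping β here probes), two candidates at cos 0.8 and 0.6
# get softmax([12.8, 9.6]) ≈ [0.96, 0.04] at β=16 but ≈ [1.00, 0.00] at β=64;
# higher β pushes toward "one clear winner or nothing", per the docstring goal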
# pre-embed everything
mem_embs = [(emb(c), emb(t), c, t) for c, t in MEMORIES]
query_embs = [(emb(q), label, relevant) for q, label, relevant in QUERIES]
print(f"\n{len(MEMORIES)} memories, {len(QUERIES)} queries, {len(CONFIGS)} configs\n")
for top_k, beta, label in CONFIGS:
print(f"{'='*70}")
print(f" {label}")
print(f"{'='*70}")
hip = HippocampalMemory(
embed_dim=EMBED_DIM, hopfield_top_k=top_k, beta=beta, device=DEVICE,
)
for ce, te, cue_text, target_text in mem_embs:
hip.store(ce, te, metadata={"cue": cue_text, "target": target_text})
for qe, qlabel, should_recall in query_embs:
results = hip.recall(qe, top_k=5)
# show distribution
sims = [r.similarity for r in results]
top1 = sims[0] if sims else 0
top2 = sims[1] if len(sims) > 1 else 0
gap = top1 - top2 # gap between #1 and #2
above_5pct = sum(1 for s in sims if s >= 0.05)
above_10pct = sum(1 for s in sims if s >= 0.10)
top_target = results[0].metadata["target"][:40] if results else ""
tag = "" if should_recall else ""
print(f" [{tag}] {qlabel:10s} top1={top1:.0%} top2={top2:.0%} gap={gap:.0%} "
f"≥5%:{above_5pct} ≥10%:{above_10pct}{top_target}")
# summary: average sharpness
total_gap = 0
total_top1 = 0
for qe, qlabel, _ in query_embs:
results = hip.recall(qe, top_k=5)
sims = [r.similarity for r in results]
total_top1 += sims[0] if sims else 0
total_gap += (sims[0] - sims[1]) if len(sims) > 1 else 0
n = len(query_embs)
print(f"\n avg top1={total_top1/n:.0%} avg gap={total_gap/n:.0%}")
print()
del hip
torch.cuda.empty_cache()