add nocmem: auto memory recall + ingest via NuoNuo hippocampal network
- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg), async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32), top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
This commit is contained in:
345
mem/benchmarks/efficiency_bench.py
Normal file
345
mem/benchmarks/efficiency_bench.py
Normal file
@@ -0,0 +1,345 @@
|
||||
"""Efficiency benchmark for nocmem vs ChromaDB baseline.
|
||||
|
||||
Measures: storage size, memory usage, query latency, ingest throughput
|
||||
at various scales (100, 1K, 5K, 10K, 20K memories).
|
||||
|
||||
Usage:
|
||||
uv run python benchmarks/efficiency_bench.py
|
||||
"""
|
||||
|
||||
import gc
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import torch
|
||||
import psutil
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from nuonuo.hippocampus import HippocampalMemory
|
||||
|
||||
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
EMBED_MODEL = "all-MiniLM-L6-v2"
|
||||
EMBED_DIM = 384
|
||||
|
||||
DATA_FILE = "benchmarks/longmemeval.json"
|
||||
|
||||
# ── helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def get_process_mem_mb():
    """Return the resident set size (RSS) of the current process in MiB."""
    this_process = psutil.Process(os.getpid())
    rss_bytes = this_process.memory_info().rss
    return rss_bytes / 1024**2
|
||||
|
||||
def get_gpu_mem_mb():
    """Return ``torch.cuda.memory_allocated()`` converted to MiB.

    Reports 0.0 whenever the benchmark is not configured to run on CUDA.
    """
    running_on_gpu = DEVICE == "cuda"
    return torch.cuda.memory_allocated() / 1024**2 if running_on_gpu else 0.0
|
||||
|
||||
def file_size_mb(path):
    """Return the size of *path* in MiB, or 0.0 when the path does not exist."""
    if not os.path.exists(path):
        return 0.0
    size_bytes = os.path.getsize(path)
    return size_bytes / 1024**2
|
||||
|
||||
def dir_size_mb(path):
    """Return the combined size, in MiB, of every file found under *path*.

    The tree is traversed with ``os.walk`` and each file is measured with
    ``os.path.getsize``.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(path)
        for name in names
    )
    return byte_count / 1024**2
|
||||
|
||||
|
||||
# ── extract chunks from LongMemEval ────────────────────────────────
|
||||
|
||||
def load_chunks(max_chunks=25000, data_file=None):
    """Extract unique turn-level chunks from a LongMemEval JSON file.

    Every session is walked two messages at a time; each complete pair is
    joined as ``"<first>\\n<second>"`` and truncated to 1000 characters. A
    trailing unpaired message is not emitted. Pairs are de-duplicated on
    ``(session_id, index)`` because the same session can appear in the
    haystack of several questions.

    Args:
        max_chunks: stop and return as soon as this many chunks are collected.
        data_file: path of the JSON file to read; defaults to ``DATA_FILE``.

    Returns:
        list[str]: the collected chunk texts, in file order.
    """
    path = DATA_FILE if data_file is None else data_file
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    chunks = []
    seen = set()
    for item in data:
        for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
            # The range stops at len(sess) - 2, so sess[i + 1] always exists.
            for i in range(0, len(sess) - 1, 2):
                key = (sid, i)
                if key in seen:
                    continue
                seen.add(key)
                user = sess[i]["content"]
                asst = sess[i + 1]["content"]
                chunks.append(f"{user}\n{asst}"[:1000])
                if len(chunks) >= max_chunks:
                    return chunks
    return chunks
|
||||
|
||||
|
||||
# ── nocmem benchmark ────────────────────────────────────────────────
|
||||
|
||||
def bench_nocmem(encoder, chunks, n, query_texts):
    """Benchmark nocmem (``HippocampalMemory``) on the first *n* chunks.

    Args:
        encoder: object exposing a SentenceTransformer-style ``encode``.
        chunks: list of chunk texts; ``chunks[:n]`` is embedded and stored.
        n: number of memories to ingest (expected to be <= ``len(chunks)``).
        query_texts: query strings; each one is recalled and timed separately.

    Returns:
        dict with keys ``n``, ``embed_time_s``, ``store_time_s``,
        ``ingest_rate`` (memories/sec), ``disk_mb``, ``gpu_delta_mb``,
        ``ram_delta_mb``, ``latency_avg_ms``, ``latency_p50_ms`` and
        ``latency_p99_ms``.
    """
    torch.cuda.empty_cache()
    gc.collect()

    subset = chunks[:n]
    gpu_before = get_gpu_mem_mb()
    ram_before = get_process_mem_mb()

    # batch embed
    t0 = time.monotonic()
    embeddings = encoder.encode(
        subset, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, batch_size=256, show_progress_bar=False,
    )
    embed_time = time.monotonic() - t0

    # store: each embedding is used as both arguments of store()
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    t1 = time.monotonic()
    for i in range(n):
        hip.store(embeddings[i], embeddings[i], metadata={"id": i})
    store_time = time.monotonic() - t1

    gpu_after = get_gpu_mem_mb()
    ram_after = get_process_mem_mb()

    # Save to measure on-disk size. A private temporary directory replaces the
    # deprecated, race-prone tempfile.mktemp() and is removed even if save()
    # raises.
    with tempfile.TemporaryDirectory() as snapshot_dir:
        snapshot = os.path.join(snapshot_dir, "nocmem.pt")
        hip.save(snapshot)
        disk_mb = file_size_mb(snapshot)

    # query latency — one recall per query, then avg / p50 / p99
    query_embs = encoder.encode(
        query_texts, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, show_progress_bar=False,
    )
    latencies = []
    for qe in query_embs:
        t = time.monotonic()
        hip.recall(qe, top_k=5)
        latencies.append((time.monotonic() - t) * 1000)

    latencies.sort()
    if latencies:
        p50 = latencies[len(latencies) // 2]
        p99 = latencies[int(len(latencies) * 0.99)]
        avg = sum(latencies) / len(latencies)
    else:
        # No queries supplied: report zeros instead of failing on the index.
        p50 = p99 = avg = 0.0

    # cleanup: drop the memory and embeddings before the next scale runs
    del hip, embeddings
    torch.cuda.empty_cache()

    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),  # memories/sec
        "disk_mb": disk_mb,
        "gpu_delta_mb": gpu_after - gpu_before,
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
|
||||
|
||||
|
||||
# ── chromadb benchmark ──────────────────────────────────────────────
|
||||
|
||||
def bench_chromadb(encoder, chunks, n, query_texts):
    """Benchmark ChromaDB (MemPalace's backend) on the first *n* chunks.

    Args:
        encoder: object exposing a SentenceTransformer-style ``encode``.
        chunks: list of chunk texts; ``chunks[:n]`` is embedded and added.
        n: number of documents to ingest (expected to be <= ``len(chunks)``).
        query_texts: query strings; each one is queried and timed separately.

    Returns:
        dict with the same keys as ``bench_nocmem``; ``gpu_delta_mb`` is
        always 0.

    Raises:
        ImportError: if ``chromadb`` is not installed.
    """
    import chromadb

    subset = chunks[:n]
    ram_before = get_process_mem_mb()

    tmpdir = tempfile.mkdtemp()
    try:
        client = chromadb.PersistentClient(path=tmpdir)
        collection = client.create_collection(
            name="bench",
            metadata={"hnsw:space": "cosine"},
        )

        # embed
        t0 = time.monotonic()
        embeddings_np = encoder.encode(
            subset, normalize_embeddings=True,
            batch_size=256, show_progress_bar=False,
        )
        embed_time = time.monotonic() - t0

        # store — embeddings are handed over as plain lists, in batches
        t1 = time.monotonic()
        batch = 5000
        for start in range(0, n, batch):
            end = min(start + batch, n)
            collection.add(
                ids=[str(i) for i in range(start, end)],
                embeddings=embeddings_np[start:end].tolist(),
                documents=subset[start:end],
            )
        store_time = time.monotonic() - t1

        ram_after = get_process_mem_mb()
        disk_mb = dir_size_mb(tmpdir)

        # query latency
        query_np = encoder.encode(
            query_texts, normalize_embeddings=True, show_progress_bar=False,
        )
        latencies = []
        for qe in query_np:
            t = time.monotonic()
            collection.query(query_embeddings=[qe.tolist()], n_results=5)
            latencies.append((time.monotonic() - t) * 1000)

        latencies.sort()
        if latencies:
            p50 = latencies[len(latencies) // 2]
            p99 = latencies[int(len(latencies) * 0.99)]
            avg = sum(latencies) / len(latencies)
        else:
            # No queries supplied: report zeros instead of failing on the index.
            p50 = p99 = avg = 0.0

        # drop the client objects before the directory is deleted
        del client, collection
    finally:
        # Always remove the on-disk store, including when a step above raised.
        shutil.rmtree(tmpdir)

    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),
        "disk_mb": disk_mb,
        "gpu_delta_mb": 0,
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
|
||||
|
||||
|
||||
# ── main ────────────────────────────────────────────────────────────
|
||||
|
||||
def _chromadb_available():
    """Return True when the optional ``chromadb`` package can be imported."""
    try:
        import chromadb  # noqa: F401  (imported only to probe availability)
    except ImportError:
        return False
    return True


def _print_results_table(nocmem_results, chroma_results, has_chromadb):
    """Print the per-scale table, side by side with ChromaDB when available."""
    if has_chromadb:
        print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}")
        print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 100)
        for nm, cr in zip(nocmem_results, chroma_results):
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | "
                f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}"
            )
    else:
        print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 60)
        for nm in nocmem_results:
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}"
            )


def _print_summary(title, result, mem_label, mem_key):
    """Print the headline numbers of one backend's largest-scale result."""
    print(f"\n{title} @ {result['n']:,}:")
    print(f" Query latency: avg {result['latency_avg_ms']:.1f}ms, p99 {result['latency_p99_ms']:.1f}ms")
    print(f" Disk: {result['disk_mb']:.1f} MB")
    print(f" {mem_label} delta: {result[mem_key]:.1f} MB")
    print(f" Ingest rate: {result['ingest_rate']:.0f} memories/sec")


def _ratio_line(label, ratio, quality):
    """Format one comparison line; ``ratio > 1`` means nocmem wins by that factor.

    A ratio of 0 is reported as an infinite ChromaDB advantage rather than
    raising ZeroDivisionError.
    """
    if ratio > 1:
        return f" {label} nocmem {ratio:.1f}x {quality}"
    inverse = 1 / ratio if ratio > 0 else float("inf")
    return f" {label} ChromaDB {inverse:.1f}x {quality}"


def _print_comparison(nm, cr):
    """Print nocmem-vs-ChromaDB ratios for latency, disk size and ingest rate."""
    print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──")
    lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf')
    disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf')
    rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf')
    print(_ratio_line("Latency:", lat_ratio, "faster"))
    print(_ratio_line("Disk:", disk_ratio, "smaller"))
    print(_ratio_line("Ingest:", rate_ratio, "faster"))


def main():
    """Run the efficiency benchmark at every feasible scale and print a report.

    nocmem is always benchmarked; ChromaDB is benchmarked alongside it only
    when the ``chromadb`` package is importable.
    """
    print("nocmem efficiency benchmark")
    print(f"device: {DEVICE}")
    print()

    # check chromadb available
    has_chromadb = _chromadb_available()
    if has_chromadb:
        print("chromadb: available (will compare)")
    else:
        print("chromadb: not installed (nocmem only)")
    print()

    print("loading data...")
    chunks = load_chunks(25000)
    print(f" {len(chunks)} unique chunks extracted")

    print("loading encoder...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

    # query texts — mix of English and Chinese
    query_texts = [
        "What degree did I graduate with?",
        "How to deploy the application?",
        "What was the database error we fixed last week?",
        "Tell me about the meeting schedule",
        "What programming language should I learn?",
        "数据库密码在哪里",
        "部署到生产环境的步骤",
        "上次讨论的性能优化方案",
        "项目的技术栈是什么",
        "最近的待办事项有哪些",
        "How do I configure the server?",
        "What's the API endpoint for user authentication?",
        "Can you recommend some books on machine learning?",
        "What was the root cause of the production incident?",
        "How much memory does the GPU have?",
        "VR设备的兼容性问题",
        "模型推理的延迟是多少",
        "代码仓库的结构是怎样的",
        "如何解决内存泄漏",
        "上次会议的结论是什么",
    ]

    # keep only the scales the extracted data can actually fill
    scales = [s for s in [100, 500, 1000, 5000, 10000, 20000] if s <= len(chunks)]

    nocmem_results = []
    chroma_results = []

    for n in scales:
        print(f"\n── scale: {n:,} memories ──")

        print(" nocmem...", end="", flush=True)
        r = bench_nocmem(encoder, chunks, n, query_texts)
        nocmem_results.append(r)
        print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)")

        if has_chromadb:
            print(" chromadb...", end="", flush=True)
            r2 = bench_chromadb(encoder, chunks, n, query_texts)
            chroma_results.append(r2)
            print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)")

    # ── report ──────────────────────────────────────────────────────

    print(f"\n{'='*80}")
    print("EFFICIENCY BENCHMARK RESULTS")
    print(f"{'='*80}")

    _print_results_table(nocmem_results, chroma_results, has_chromadb)

    # summary of the largest scale that was run
    if nocmem_results:
        _print_summary("nocmem", nocmem_results[-1], "VRAM", "gpu_delta_mb")

    if chroma_results:
        _print_summary("ChromaDB", chroma_results[-1], "RAM", "ram_delta_mb")

    if has_chromadb and nocmem_results and chroma_results:
        _print_comparison(nocmem_results[-1], chroma_results[-1])


if __name__ == "__main__":
    main()
|
||||
239
mem/benchmarks/longmemeval_bench.py
Normal file
239
mem/benchmarks/longmemeval_bench.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"""LongMemEval benchmark for nocmem.
|
||||
|
||||
Evaluates retrieval quality: given a question, can nocmem find the correct
|
||||
session(s) from a haystack of ~50 conversation sessions?
|
||||
|
||||
Uses HippocampalMemory directly (no HTTP) for speed.
|
||||
Compares against MemPalace's 96.6% R@5 baseline.
|
||||
|
||||
Usage:
|
||||
uv run python benchmarks/longmemeval_bench.py [--limit N] [--granularity session|turn]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
import time
|
||||
|
||||
import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from nuonuo.hippocampus import HippocampalMemory
|
||||
|
||||
# ── setup ───────────────────────────────────────────────────────────
|
||||
|
||||
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
EMBED_MODEL = "all-MiniLM-L6-v2"
|
||||
EMBED_DIM = 384
|
||||
|
||||
|
||||
def load_encoder():
    """Announce the load, then build the SentenceTransformer on DEVICE."""
    banner = f"loading {EMBED_MODEL} on {DEVICE}..."
    print(banner)
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
    return encoder
|
||||
|
||||
|
||||
def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
    """Encode *texts* in a single batched call and return the resulting tensor.

    Embeddings are requested normalized, as a tensor on DEVICE, in batches of
    128 and without a progress bar.
    """
    encode_options = {
        "convert_to_tensor": True,
        "normalize_embeddings": True,
        "device": DEVICE,
        "batch_size": 128,
        "show_progress_bar": False,
    }
    return encoder.encode(texts, **encode_options)
|
||||
|
||||
|
||||
# ── granularity: how to chunk sessions ──────────────────────────────
|
||||
|
||||
def sessions_to_chunks_turn(session_ids, sessions):
    """Split every session into one chunk per pair of consecutive messages.

    Messages are paired as (0, 1), (2, 3), ...; each pair is joined with a
    newline and truncated to 1000 characters. When a session has an odd number
    of messages, the final unpaired message becomes a chunk of its own.

    Args:
        session_ids: session identifiers, parallel to *sessions*.
        sessions: list of sessions, each a list of dicts with a ``"content"`` key.

    Returns:
        list[tuple[str, Any]]: ``(text, session_id)`` pairs in input order.
    """
    chunks = []  # (text, session_id)
    for sid, sess in zip(session_ids, sessions):
        # zip() stops at the shorter slice, so only complete pairs are visited.
        for first, second in zip(sess[0::2], sess[1::2]):
            text = f"{first['content']}\n{second['content']}"
            # truncate long turns to avoid embedding issues
            chunks.append((text[:1000], sid))
        # handle the trailing message of an odd-length session
        if len(sess) % 2 == 1:
            chunks.append((sess[-1]["content"][:1000], sid))
    return chunks
|
||||
|
||||
|
||||
def sessions_to_chunks_session(session_ids, sessions):
    """Collapse each session into one chunk.

    All message contents of a session are joined with newlines and the result
    is cut to 2000 characters to fit the embedding model's context.

    Returns:
        list of ``(text, session_id)`` tuples, one per session.
    """
    return [
        ("\n".join(message["content"] for message in sess)[:2000], sid)
        for sid, sess in zip(session_ids, sessions)
    ]
|
||||
|
||||
|
||||
# ── evaluate one question ───────────────────────────────────────────
|
||||
|
||||
def evaluate_question(encoder, item, granularity, ks=(5, 10)):
    """Index one question's haystack, query it, and score the retrieval.

    Args:
        encoder: embedding model passed to ``embed_batch`` and used for the query.
        item: one benchmark record; reads ``haystack_session_ids``,
            ``haystack_sessions``, ``answer_session_ids`` and ``question``.
        granularity: ``"turn"`` for per-turn chunks, anything else for
            per-session chunks.
        ks: cut-offs for which a hit-rate entry ``R@k`` is produced.

    Returns:
        dict with one ``R@k`` entry per cut-off (1.0 on a hit, else 0.0),
        ``NDCG@10``, the timings ``embed_ms`` / ``store_ms`` / ``recall_ms``
        and ``n_chunks``.
    """
    # pick the chunking strategy, then chunk the haystack
    chunker = sessions_to_chunks_turn if granularity == "turn" else sessions_to_chunks_session
    chunks = chunker(item["haystack_session_ids"], item["haystack_sessions"])

    texts = [text for text, _ in chunks]
    sids = [sid for _, sid in chunks]
    answer_sids = set(item["answer_session_ids"])

    # embed every chunk in one batch
    embed_started = time.monotonic()
    embeddings = embed_batch(encoder, texts)
    embed_time = time.monotonic() - embed_started

    # build the memory; each embedding is passed as both store() arguments
    store_started = time.monotonic()
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    for idx, sid in enumerate(sids):
        hip.store(embeddings[idx], embeddings[idx], metadata={"session_id": sid})
    store_time = time.monotonic() - store_started

    # query (the timing covers embedding the question as well as the recall)
    recall_started = time.monotonic()
    query_emb = encoder.encode(
        [item["question"]], convert_to_tensor=True,
        normalize_embeddings=True, device=DEVICE,
    )[0]
    results = hip.recall(query_emb, top_k=max(ks))
    recall_time = time.monotonic() - recall_started

    # collapse hits to unique session ids, keeping first-seen rank order
    ranked_sids = list(dict.fromkeys(hit.metadata["session_id"] for hit in results))

    metrics = {
        f"R@{k}": 1.0 if answer_sids.intersection(ranked_sids[:k]) else 0.0
        for k in ks
    }
    metrics["NDCG@10"] = compute_ndcg(ranked_sids[:10], answer_sids)
    metrics.update(
        embed_ms=embed_time * 1000,
        store_ms=store_time * 1000,
        recall_ms=recall_time * 1000,
        n_chunks=len(chunks),
    )
    return metrics
|
||||
|
||||
|
||||
def compute_ndcg(ranked_sids, answer_sids, k=10):
    """Return binary-relevance NDCG over the first *k* ranked session ids.

    A ranked id counts as relevant when it is a member of *answer_sids*. The
    ideal ordering places ``min(len(answer_sids), k)`` relevant ids first.
    Returns 0.0 when the ideal gain is zero.
    """
    # ideal DCG: every relevant session packed at the top of the ranking
    ideal_hits = min(len(answer_sids), k)
    idcg = sum(1.0 / math.log2(pos + 2) for pos in range(ideal_hits))
    if idcg <= 0:
        return 0.0

    dcg = 0.0
    for rank, sid in enumerate(ranked_sids[:k], start=1):
        if sid not in answer_sids:
            continue
        dcg += 1.0 / math.log2(rank + 1)  # discount grows with 1-based rank
    return dcg / idcg
|
||||
|
||||
|
||||
# ── main ────────────────────────────────────────────────────────────
|
||||
|
||||
def _mean(rows, key):
    """Return the arithmetic mean of ``row[key]`` over a non-empty *rows*."""
    return sum(row[key] for row in rows) / len(rows)


def main():
    """Run the LongMemEval retrieval benchmark and print aggregate results.

    Command-line options:
        --data: path of the benchmark JSON file.
        --limit: evaluate only the first N questions (0 means all).
        --granularity: chunk the haystack per ``session`` or per ``turn``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default="benchmarks/longmemeval.json")
    parser.add_argument("--limit", type=int, default=0, help="limit number of questions (0=all)")
    parser.add_argument("--granularity", choices=["session", "turn"], default="turn")
    args = parser.parse_args()

    print("LongMemEval benchmark for nocmem")
    print(f"granularity: {args.granularity}")
    print(f"device: {DEVICE}")
    print()

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(args.data, encoding="utf-8") as f:
        data = json.load(f)

    if args.limit:
        data = data[:args.limit]

    if not data:
        # Every average below divides by the question count.
        print("no questions to evaluate")
        return

    encoder = load_encoder()

    print(f"evaluating {len(data)} questions...\n")

    all_metrics = []
    by_type = {}

    for i, item in enumerate(data):
        metrics = evaluate_question(encoder, item, args.granularity)
        all_metrics.append(metrics)
        by_type.setdefault(item["question_type"], []).append(metrics)

        # progress: running averages every 10 questions and on the last one
        if (i + 1) % 10 == 0 or i == len(data) - 1:
            r5 = _mean(all_metrics, "R@5") * 100
            r10 = _mean(all_metrics, "R@10") * 100
            avg_recall = _mean(all_metrics, "recall_ms")
            print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")

    # final results
    n = len(all_metrics)
    r5 = _mean(all_metrics, "R@5") * 100
    r10 = _mean(all_metrics, "R@10") * 100
    ndcg = _mean(all_metrics, "NDCG@10") * 100
    avg_embed = _mean(all_metrics, "embed_ms")
    avg_store = _mean(all_metrics, "store_ms")
    avg_recall = _mean(all_metrics, "recall_ms")
    avg_chunks = _mean(all_metrics, "n_chunks")

    print(f"\n{'='*60}")
    print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
    print(f"{'='*60}")
    print(f" Questions: {n}")
    print(f" Avg chunks: {avg_chunks:.0f}")
    print("")
    print(f" R@5: {r5:.1f}%")
    print(f" R@10: {r10:.1f}%")
    print(f" NDCG@10: {ndcg:.1f}%")
    print("")
    print(f" Avg embed: {avg_embed:.0f}ms")
    print(f" Avg store: {avg_store:.0f}ms")
    print(f" Avg recall: {avg_recall:.1f}ms")

    print("\n── by question type ──")
    for qtype, ms in sorted(by_type.items()):
        nt = len(ms)
        tr5 = _mean(ms, "R@5") * 100
        tr10 = _mean(ms, "R@10") * 100
        print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")

    print("\n── comparison ──")
    print(" MemPalace (raw, session): R@5=96.6%")
    print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")


if __name__ == "__main__":
    main()
|
||||
178
mem/benchmarks/noise_vs_scale.py
Normal file
178
mem/benchmarks/noise_vs_scale.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""Does recall noise decrease as memory count grows?
|
||||
|
||||
At various scales, measure:
|
||||
1. Recall accuracy (R@3) for relevant queries
|
||||
2. Max cosine similarity for irrelevant queries
|
||||
3. Separation gap between relevant and irrelevant
|
||||
|
||||
If nocmem works well at scale, the gap should widen — relevant queries
|
||||
should score much higher than irrelevant ones as the memory pool grows.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import torch
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from nuonuo.hippocampus import HippocampalMemory
|
||||
|
||||
# Use CUDA when present and fall back to CPU otherwise, matching the other
# benchmark scripts, so this one also runs on hosts without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Embedding width handed to HippocampalMemory(embed_dim=...).
EMBED_DIM = 384
# LongMemEval dump read by main(); the path is relative to the working directory.
DATA_FILE = "benchmarks/longmemeval.json"
|
||||
|
||||
IRRELEVANT_QUERIES = [
|
||||
"今天天气怎么样",
|
||||
"你喜欢吃什么",
|
||||
"嗨",
|
||||
"讲个笑话",
|
||||
"明天会下雨吗",
|
||||
"你觉得猫可爱还是狗可爱",
|
||||
"人生的意义是什么",
|
||||
"帮我写一首诗",
|
||||
"地球到月球有多远",
|
||||
"如何学会游泳",
|
||||
]
|
||||
|
||||
BETA_CONFIGS = [16.0, 32.0, 64.0]
|
||||
SCALES = [50, 200, 500, 1000, 3000]
|
||||
|
||||
|
||||
def main():
    """Measure recall quality and score separation as the memory pool grows.

    For every β in BETA_CONFIGS and every scale in SCALES that the data can
    fill, a fresh HippocampalMemory is built from the first *n* unique chunks.
    Relevant queries (benchmark questions whose answer session is present in
    the subset) are compared against IRRELEVANT_QUERIES on R@3, the maximum
    cosine similarity to any stored cue, and the top-1 recall similarity.
    """
    print("noise vs scale benchmark\n")
    print("loading encoder...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

    def emb(text):
        return encoder.encode([text], convert_to_tensor=True,
                              normalize_embeddings=True, device=DEVICE)[0]

    def emb_batch(texts):
        return encoder.encode(texts, convert_to_tensor=True,
                              normalize_embeddings=True, device=DEVICE,
                              batch_size=256, show_progress_bar=False)

    # load data (JSON is UTF-8 by specification)
    print("loading data...")
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    # collect unique chunks with their source question index
    all_chunks = []  # (text, question_idx, session_id)
    seen = set()
    for qi, item in enumerate(data):
        for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
            # The range stops at len(sess) - 2, so sess[i + 1] always exists.
            for i in range(0, len(sess) - 1, 2):
                key = (sid, i)
                if key in seen:
                    continue
                seen.add(key)
                user = sess[i]["content"]
                asst = sess[i + 1]["content"]
                text = f"{user}\n{asst}"[:1000]
                all_chunks.append((text, qi, sid))
    print(f" {len(all_chunks)} unique chunks")

    # pre-embed irrelevant queries
    irrel_embs = [emb(q) for q in IRRELEVANT_QUERIES]

    # relevant queries: the first 100 questions, each paired with the set of
    # session ids that contain its answer
    relevant_queries = [
        (item["question"], set(item["answer_session_ids"]))
        for item in data[:100]
    ]
    rel_query_embs = emb_batch([q for q, _ in relevant_queries])

    print(f" {len(relevant_queries)} relevant queries")
    print(f" {len(IRRELEVANT_QUERIES)} irrelevant queries")

    # filter scales to what we have
    scales = [s for s in SCALES if s <= len(all_chunks)]

    for beta in BETA_CONFIGS:
        print(f"\n{'='*70}")
        print(f" β = {beta}")
        print(f"{'='*70}")
        print(f"{'Scale':>7} | {'R@3':>6} | {'Rel maxcos':>10} {'Irrel maxcos':>12} {'Gap':>8} | {'Rel attn':>9} {'Irrel attn':>11}")
        print("-" * 80)

        for n in scales:
            subset = all_chunks[:n]
            texts = [c[0] for c in subset]
            sids = [c[2] for c in subset]
            # Built once per scale; the membership test below runs per query.
            subset_sids = set(sids)

            # embed and build memory
            embeddings = emb_batch(texts)
            hip = HippocampalMemory(
                embed_dim=EMBED_DIM, beta=beta, hopfield_top_k=10, device=DEVICE,
            )
            for i in range(n):
                hip.store(embeddings[i], embeddings[i],
                          metadata={"session_id": sids[i]})

            # NOTE(review): relies on a private HippocampalMemory method.
            cue_mat = hip._get_cue_matrix()

            # --- relevant queries ---
            rel_max_cos = []
            rel_top_attn = []
            hits = 0
            tested = 0

            for (_, answer_sids), qe in zip(relevant_queries, rel_query_embs):
                # skip questions whose answer session is not in this subset
                if not (answer_sids & subset_sids):
                    continue
                tested += 1

                # cosine sim (embeddings are requested normalized)
                cos_sims = qe @ cue_mat.T
                rel_max_cos.append(cos_sims.max().item())

                # recall
                results = hip.recall(qe, top_k=3)
                top_attn = results[0].similarity if results else 0
                rel_top_attn.append(top_attn)

                recalled_sids = {r.metadata["session_id"] for r in results}
                if answer_sids & recalled_sids:
                    hits += 1

            r3 = hits / tested * 100 if tested > 0 else 0
            avg_rel_cos = np.mean(rel_max_cos) if rel_max_cos else 0
            avg_rel_attn = np.mean(rel_top_attn) if rel_top_attn else 0

            # --- irrelevant queries ---
            irrel_max_cos = []
            irrel_top_attn = []
            for qe in irrel_embs:
                cos_sims = qe @ cue_mat.T
                irrel_max_cos.append(cos_sims.max().item())

                results = hip.recall(qe, top_k=3)
                top_attn = results[0].similarity if results else 0
                irrel_top_attn.append(top_attn)

            avg_irrel_cos = np.mean(irrel_max_cos)
            avg_irrel_attn = np.mean(irrel_top_attn)

            gap = avg_rel_cos - avg_irrel_cos

            print(f"{n:>7,} | {r3:>5.1f}% | {avg_rel_cos:>10.3f} {avg_irrel_cos:>12.3f} {gap:>8.3f} | {avg_rel_attn:>8.0%} {avg_irrel_attn:>10.0%}")

            # drop the memory before the next scale is built
            del hip
            torch.cuda.empty_cache()

    # legend for the table columns (printed in Chinese)
    print(f"\n── 解读 ──")
    print(f"Rel maxcos: 相关查询的最大余弦相似度(越高越好)")
    print(f"Irrel maxcos: 无关查询的最大余弦相似度(越低越好)")
    print(f"Gap: 两者之差(越大越好 = 越容易区分)")
    print(f"Rel attn: 相关查询 top1 的 Hopfield attention 权重")
    print(f"Irrel attn: 无关查询 top1 的 Hopfield attention 权重(越低 = 越少噪音)")


if __name__ == "__main__":
    main()
|
||||
104
mem/benchmarks/sharpness_test.py
Normal file
104
mem/benchmarks/sharpness_test.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""Test Hopfield attention sharpness with different top_k and beta.
|
||||
|
||||
Goal: find settings that give "either clearly remembered or nothing"
|
||||
instead of flat attention across 20 candidates.
|
||||
"""
|
||||
|
||||
import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from nuonuo.hippocampus import HippocampalMemory
|
||||
|
||||
# Use CUDA when present and fall back to CPU otherwise, so the script also
# runs on hosts without a GPU (same selection as the other benchmark scripts).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_DIM = 384

print("loading encoder...")
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)


def emb(text):
    """Return the normalized embedding of a single text as a tensor on DEVICE."""
    return encoder.encode([text], convert_to_tensor=True, normalize_embeddings=True, device=DEVICE)[0]


# (cue, target) pairs; the same memories are stored for every config
MEMORIES = [
    ("bot的名字叫什么", "bot的名字叫小乖,是Fam给取的"),
    ("有哪些工具可以用", "工具有: fam_todo, send_file, spawn_agent, run_shell, run_python, update_memory"),
    ("vLLM在5090上的性能", "RTX 5090上vLLM跑gemma只有4.8 tok/s,需要切换到awq_marlin"),
    ("repo-vis项目是什么", "repo-vis用Rust后端+Three.js前端的3D代码库可视化,目标支持Linux内核和Pico VR"),
    ("repo-vis的性能瓶颈", "Linux内核79K文件,SQLite 1GB上限和O(n)反序列化是瓶颈,需要n-ary tree按需合并"),
    ("明天的待办事项", "最紧迫的是emblem scanner的AI Chat和KB部分"),
    ("后端切换到了什么", "NOC后端切换到了vLLM,速度变快了"),
    ("数据库密码在哪里", "数据库密码存在 /etc/secrets/db.env 文件中"),
    ("什么GPU", "服务器有NVIDIA RTX 4090 24GB VRAM"),
    ("home有多少log文件", "home目录及子目录下共有960个.log文件"),
]

# (query text, short label for the report, whether a memory should be recalled)
QUERIES = [
    ("repo-vis怎么样了", "repo-vis", True),  # should recall clearly
    ("数据库密码", "密码", True),  # should recall clearly
    ("今天天气怎么样", "天气", False),  # irrelevant, should recall nothing
    ("vllm速度", "vllm", True),  # should recall clearly
    ("你喜欢吃什么", "吃什么", False),  # irrelevant
    ("VR支持", "VR", True),  # edge case
]

CONFIGS = [
    # (top_k, beta, label)
    (20, 16.0, "baseline (top_k=20, β=16)"),
    (10, 16.0, "top_k=10, β=16"),
    (5, 16.0, "top_k=5, β=16"),
    (20, 32.0, "top_k=20, β=32"),
    (20, 64.0, "top_k=20, β=64"),
    (10, 32.0, "top_k=10, β=32"),
    (5, 32.0, "top_k=5, β=32"),
    (5, 64.0, "top_k=5, β=64"),
]

# pre-embed everything once; the embeddings are reused by every config
mem_embs = [(emb(c), emb(t), c, t) for c, t in MEMORIES]
query_embs = [(emb(q), label, relevant) for q, label, relevant in QUERIES]

print(f"\n{len(MEMORIES)} memories, {len(QUERIES)} queries, {len(CONFIGS)} configs\n")

for top_k, beta, label in CONFIGS:
    print(f"{'='*70}")
    print(f" {label}")
    print(f"{'='*70}")

    # fresh memory per config so the settings are compared on identical content
    hip = HippocampalMemory(
        embed_dim=EMBED_DIM, hopfield_top_k=top_k, beta=beta, device=DEVICE,
    )
    for ce, te, cue_text, target_text in mem_embs:
        hip.store(ce, te, metadata={"cue": cue_text, "target": target_text})

    for qe, qlabel, should_recall in query_embs:
        results = hip.recall(qe, top_k=5)

        # show distribution
        sims = [r.similarity for r in results]
        top1 = sims[0] if sims else 0
        top2 = sims[1] if len(sims) > 1 else 0
        gap = top1 - top2  # gap between #1 and #2
        above_5pct = sum(1 for s in sims if s >= 0.05)
        above_10pct = sum(1 for s in sims if s >= 0.10)

        top_target = results[0].metadata["target"][:40] if results else "—"
        # the tag marks the expectation for this query, not the outcome
        tag = "✓" if should_recall else "✗"

        print(f" [{tag}] {qlabel:10s} top1={top1:.0%} top2={top2:.0%} gap={gap:.0%} "
              f"≥5%:{above_5pct} ≥10%:{above_10pct} → {top_target}")

    # summary: average sharpness
    # NOTE(review): recall is run a second time per query here; it is not
    # merged into the loop above because it is not visible from this script
    # whether recall() has side effects — confirm before deduplicating.
    total_gap = 0
    total_top1 = 0
    for qe, qlabel, _ in query_embs:
        results = hip.recall(qe, top_k=5)
        sims = [r.similarity for r in results]
        total_top1 += sims[0] if sims else 0
        total_gap += (sims[0] - sims[1]) if len(sims) > 1 else 0

    n = len(query_embs)
    print(f"\n avg top1={total_top1/n:.0%} avg gap={total_gap/n:.0%}")
    print()

    del hip
    torch.cuda.empty_cache()
|
||||
Reference in New Issue
Block a user