Files
noc/mem/benchmarks/longmemeval_bench.py
Fam Zheng 7000ccda0f add nocmem: auto memory recall + ingest via NuoNuo hippocampal network
- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
2026-04-11 12:24:48 +01:00

240 lines
8.0 KiB
Python
Raw Blame History

"""LongMemEval benchmark for nocmem.
Evaluates retrieval quality: given a question, can nocmem find the correct
session(s) from a haystack of ~50 conversation sessions?
Uses HippocampalMemory directly (no HTTP) for speed.
Compares against MemPalace's 96.6% R@5 baseline.
Usage:
uv run python benchmarks/longmemeval_bench.py [--limit N] [--granularity session|turn]
"""
import argparse
import json
import math
import sys
import time
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
# ── setup ───────────────────────────────────────────────────────────
# Use the GPU when available; embeddings and memory ops run on this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Sentence-transformer checkpoint used for both chunk and query embeddings.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Output dimensionality of EMBED_MODEL; must match HippocampalMemory's embed_dim.
EMBED_DIM = 384
def load_encoder():
    """Load and return the SentenceTransformer embedding model on DEVICE."""
    print(f"loading {EMBED_MODEL} on {DEVICE}...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
    return encoder
def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
    """Embed *texts* in one batched call; returns an (N, dim) tensor.

    Embeddings are L2-normalized so cosine similarity reduces to a dot
    product downstream.
    """
    encode_options = dict(
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
        batch_size=128,
        show_progress_bar=False,
    )
    return encoder.encode(texts, **encode_options)
# ── granularity: how to chunk sessions ──────────────────────────────
def sessions_to_chunks_turn(session_ids, sessions):
    """Split each session into per-turn chunks.

    A chunk is one user message joined (by a newline) with the assistant
    reply that follows it, capped at 1000 characters to avoid embedding
    issues.  A trailing unpaired message becomes a chunk of its own.
    Returns a list of (text, session_id) tuples.
    """
    chunks = []
    for sid, messages in zip(session_ids, sessions):
        total = len(messages)
        # walk user/assistant pairs: indices 0-1, 2-3, ...
        for start in range(0, total - 1, 2):
            user_text = messages[start]["content"]
            reply = messages[start + 1]["content"] if start + 1 < total else ""
            turn = f"{user_text}\n{reply}"
            chunks.append((turn[:1000], sid))
        # an odd message count leaves a final reply-less message
        if total % 2:
            chunks.append((messages[-1]["content"][:1000], sid))
    return chunks
def sessions_to_chunks_session(session_ids, sessions):
    """Collapse each session into a single chunk of newline-joined messages.

    Text is capped at 2000 characters so it fits the embedding model's
    context.  Returns a list of (text, session_id) tuples.
    """
    joined = ("\n".join(msg["content"] for msg in sess) for sess in sessions)
    return [(text[:2000], sid) for sid, text in zip(session_ids, joined)]
# ── evaluate one question ───────────────────────────────────────────
def evaluate_question(encoder, item, granularity, ks=(5, 10)):
    """Run one LongMemEval question end-to-end and score retrieval.

    Embeds the question's haystack sessions (chunked per *granularity*),
    stores them in a fresh HippocampalMemory, queries with the question,
    and checks whether an answer session lands in the top-K results.

    Args:
        encoder: SentenceTransformer used for chunk and query embeddings.
        item: LongMemEval record with keys "haystack_session_ids",
            "haystack_sessions", "answer_session_ids", and "question".
        granularity: "turn" (chunk per user/assistant turn) or
            "session" (chunk per whole session).
        ks: cutoffs for the R@K hit metrics.

    Returns:
        dict with R@K for each k in *ks*, "NDCG@10" (fixed at 10,
        independent of *ks*), per-phase timings in ms, and "n_chunks".
    """
    # chunk the haystack at the requested granularity
    chunker = (sessions_to_chunks_turn if granularity == "turn"
               else sessions_to_chunks_session)
    chunks = chunker(item["haystack_session_ids"], item["haystack_sessions"])
    texts = [text for text, _ in chunks]
    sids = [sid for _, sid in chunks]
    answer_sids = set(item["answer_session_ids"])

    # batch embed all chunks
    t0 = time.monotonic()
    embeddings = embed_batch(encoder, texts)
    embed_time = time.monotonic() - t0

    # build a fresh memory; each chunk is stored with key == value embedding
    t1 = time.monotonic()
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    for emb, sid in zip(embeddings, sids):
        hip.store(emb, emb, metadata={"session_id": sid})
    store_time = time.monotonic() - t1

    # query — embed via embed_batch so the query uses exactly the same
    # encode settings as the chunks (the original inlined a second
    # encoder.encode call with slightly different arguments)
    t2 = time.monotonic()
    query_emb = embed_batch(encoder, [item["question"]])[0]
    max_k = max(ks)
    results = hip.recall(query_emb, top_k=max_k)
    recall_time = time.monotonic() - t2

    # deduplicate by session_id, preserving rank order
    seen = set()
    ranked_sids = []
    for r in results:
        sid = r.metadata["session_id"]
        if sid not in seen:
            seen.add(sid)
            ranked_sids.append(sid)

    # hit metrics: did any answer session make the top k?
    metrics = {}
    for k in ks:
        hit = bool(answer_sids & set(ranked_sids[:k]))
        metrics[f"R@{k}"] = 1.0 if hit else 0.0
    # NDCG@10 — always at cutoff 10 regardless of ks
    metrics["NDCG@10"] = compute_ndcg(ranked_sids[:10], answer_sids)
    metrics["embed_ms"] = embed_time * 1000
    metrics["store_ms"] = store_time * 1000
    metrics["recall_ms"] = recall_time * 1000
    metrics["n_chunks"] = len(chunks)
    return metrics
def compute_ndcg(ranked_sids, answer_sids, k=10):
    """Normalized Discounted Cumulative Gain at rank *k*.

    Binary relevance: a ranked id counts iff it appears in *answer_sids*.
    Returns 0.0 when there are no relevant ids (IDCG would be zero).
    """
    # rank r (1-based) contributes 1/log2(r + 1); enumerate is 0-based,
    # hence pos + 2 inside the log
    gains = [
        1.0 / math.log2(pos + 2)
        for pos, sid in enumerate(ranked_sids[:k])
        if sid in answer_sids
    ]
    dcg = sum(gains)
    # ideal ranking places every relevant session at the top
    ideal_hits = min(len(answer_sids), k)
    idcg = sum(1.0 / math.log2(pos + 2) for pos in range(ideal_hits))
    if idcg == 0:
        return 0.0
    return dcg / idcg
# ── main ────────────────────────────────────────────────────────────
def main():
    """CLI entry point: run the LongMemEval benchmark over a JSON dataset.

    Prints running averages every 10 questions, then aggregate metrics,
    a per-question-type breakdown, and a comparison against the
    MemPalace baseline.
    """
    parser = argparse.ArgumentParser()
    # dataset: a JSON list of LongMemEval question records
    parser.add_argument("--data", default="benchmarks/longmemeval.json")
    parser.add_argument("--limit", type=int, default=0, help="limit number of questions (0=all)")
    parser.add_argument("--granularity", choices=["session", "turn"], default="turn")
    args = parser.parse_args()
    print(f"LongMemEval benchmark for nocmem")
    print(f"granularity: {args.granularity}")
    print(f"device: {DEVICE}")
    print()
    with open(args.data) as f:
        data = json.load(f)
    if args.limit:
        data = data[:args.limit]
    encoder = load_encoder()
    print(f"evaluating {len(data)} questions...\n")
    all_metrics = []
    by_type = {}  # question_type -> list of per-question metric dicts
    for i, item in enumerate(data):
        metrics = evaluate_question(encoder, item, args.granularity)
        all_metrics.append(metrics)
        qtype = item["question_type"]
        if qtype not in by_type:
            by_type[qtype] = []
        by_type[qtype].append(metrics)
        # progress: running averages every 10 questions and at the end
        if (i + 1) % 10 == 0 or i == len(data) - 1:
            r5 = sum(m["R@5"] for m in all_metrics) / len(all_metrics) * 100
            r10 = sum(m["R@10"] for m in all_metrics) / len(all_metrics) * 100
            avg_recall = sum(m["recall_ms"] for m in all_metrics) / len(all_metrics)
            print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")
    # final results: averages over all evaluated questions
    n = len(all_metrics)
    r5 = sum(m["R@5"] for m in all_metrics) / n * 100
    r10 = sum(m["R@10"] for m in all_metrics) / n * 100
    ndcg = sum(m["NDCG@10"] for m in all_metrics) / n * 100
    avg_embed = sum(m["embed_ms"] for m in all_metrics) / n
    avg_store = sum(m["store_ms"] for m in all_metrics) / n
    avg_recall = sum(m["recall_ms"] for m in all_metrics) / n
    avg_chunks = sum(m["n_chunks"] for m in all_metrics) / n
    print(f"\n{'='*60}")
    print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
    print(f"{'='*60}")
    print(f" Questions: {n}")
    print(f" Avg chunks: {avg_chunks:.0f}")
    print(f"")
    print(f" R@5: {r5:.1f}%")
    print(f" R@10: {r10:.1f}%")
    print(f" NDCG@10: {ndcg:.1f}%")
    print(f"")
    print(f" Avg embed: {avg_embed:.0f}ms")
    print(f" Avg store: {avg_store:.0f}ms")
    print(f" Avg recall: {avg_recall:.1f}ms")
    # breakdown so weak question types (e.g. temporal) are visible
    print(f"\n── by question type ──")
    for qtype, ms in sorted(by_type.items()):
        nt = len(ms)
        tr5 = sum(m["R@5"] for m in ms) / nt * 100
        tr10 = sum(m["R@10"] for m in ms) / nt * 100
        print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")
    print(f"\n── comparison ──")
    print(f" MemPalace (raw, session): R@5=96.6%")
    print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")
# script entry point
if __name__ == "__main__":
    main()