"""LongMemEval benchmark for nocmem.

Evaluates retrieval quality: given a question, can nocmem find the correct
session(s) from a haystack of ~50 conversation sessions?

Uses HippocampalMemory directly (no HTTP) for speed.
Compares against MemPalace's 96.6% R@5 baseline.

Usage:
    uv run python benchmarks/longmemeval_bench.py [--limit N]
        [--granularity session|turn] [--fetch-k N]
"""

import argparse
import json
import math
import sys
import time

import torch
from sentence_transformers import SentenceTransformer

from nuonuo.hippocampus import HippocampalMemory

# ── setup ───────────────────────────────────────────────────────────

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "all-MiniLM-L6-v2"
EMBED_DIM = 384

# Character caps applied to chunk text before it is embedded.
TURN_CHAR_LIMIT = 1000
SESSION_CHAR_LIMIT = 2000


def load_encoder():
    """Load the sentence-embedding model named by EMBED_MODEL onto DEVICE."""
    print(f"loading {EMBED_MODEL} on {DEVICE}...")
    return SentenceTransformer(EMBED_MODEL, device=DEVICE)


def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
    """Batch embed, returns (N, dim) tensor of L2-normalized embeddings."""
    return encoder.encode(
        texts,
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
        batch_size=128,
        show_progress_bar=False,
    )


# ── granularity: how to chunk sessions ──────────────────────────────

def sessions_to_chunks_turn(session_ids, sessions):
    """Split every session into consecutive message pairs.

    Messages are paired positionally (0+1, 2+3, ...); the first message of a
    pair is treated as the user side and the second as the assistant side.
    A trailing unpaired message becomes a chunk of its own.

    Args:
        session_ids: Session identifiers, parallel to ``sessions``.
        sessions: For each session, a list of message dicts with a
            ``"content"`` key.

    Returns:
        List of ``(text, session_id)`` tuples; ``text`` is capped at
        TURN_CHAR_LIMIT characters.
    """
    chunks = []  # (text, session_id)
    for sid, sess in zip(session_ids, sessions):
        # The range stops at len(sess) - 1, so sess[i + 1] always exists.
        for i in range(0, len(sess) - 1, 2):
            user = sess[i]["content"]
            asst = sess[i + 1]["content"]
            text = f"{user}\n{asst}"
            # truncate long turns to avoid embedding issues
            chunks.append((text[:TURN_CHAR_LIMIT], sid))
        # An odd message count leaves one final message without a partner.
        if len(sess) % 2 == 1:
            chunks.append((sess[-1]["content"][:TURN_CHAR_LIMIT], sid))
    return chunks


def sessions_to_chunks_session(session_ids, sessions):
    """Turn each session into one chunk (all messages joined by newlines).

    Args:
        session_ids: Session identifiers, parallel to ``sessions``.
        sessions: For each session, a list of message dicts with a
            ``"content"`` key.

    Returns:
        List of ``(text, session_id)`` tuples; ``text`` is capped at
        SESSION_CHAR_LIMIT characters.
    """
    chunks = []
    for sid, sess in zip(session_ids, sessions):
        text = "\n".join(m["content"] for m in sess)
        # truncate to fit embedding model's context
        chunks.append((text[:SESSION_CHAR_LIMIT], sid))
    return chunks


# ── evaluate one question ───────────────────────────────────────────

def evaluate_question(encoder, item, granularity, ks=(5, 10), fetch_k=0):
    """Store haystack, query, check if answer session in top-K.

    Args:
        encoder: Embedding model passed through to ``embed_batch``.
        item: One benchmark record; reads the keys ``haystack_session_ids``,
            ``haystack_sessions``, ``answer_session_ids`` and ``question``.
        granularity: ``"turn"`` for per-turn chunks; any other value uses one
            chunk per session.
        ks: Cut-offs K for which ``R@K`` is reported.
        fetch_k: Number of chunks to request from memory before results are
            collapsed to unique sessions. Values below ``max(ks)`` (including
            the default 0) fall back to ``max(ks)``. With turn granularity
            several retrieved chunks can belong to one session, so the
            de-duplicated ranking may hold fewer than K sessions unless more
            chunks are fetched.

    Returns:
        Dict with ``R@K`` for every K in ``ks``, ``NDCG@10``, the timings
        ``embed_ms`` / ``store_ms`` / ``recall_ms`` and ``n_chunks``.
    """
    # chunk the haystack
    if granularity == "turn":
        chunker = sessions_to_chunks_turn
    else:
        chunker = sessions_to_chunks_session
    chunks = chunker(item["haystack_session_ids"], item["haystack_sessions"])

    answer_sids = set(item["answer_session_ids"])

    if not chunks:
        # Nothing to embed or store, so no session can be retrieved; report a
        # miss instead of handing an empty batch to the encoder.
        metrics = {f"R@{k}": 0.0 for k in ks}
        metrics["NDCG@10"] = 0.0
        metrics["embed_ms"] = 0.0
        metrics["store_ms"] = 0.0
        metrics["recall_ms"] = 0.0
        metrics["n_chunks"] = 0
        return metrics

    texts = [text for text, _ in chunks]
    sids = [sid for _, sid in chunks]

    # batch embed all chunks
    t0 = time.monotonic()
    embeddings = embed_batch(encoder, texts)
    embed_time = time.monotonic() - t0

    # build memory: each chunk embedding is stored as both arguments of store()
    t1 = time.monotonic()
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    for emb, sid in zip(embeddings, sids):
        hip.store(emb, emb, metadata={"session_id": sid})
    store_time = time.monotonic() - t1

    # query (timing covers both question embedding and recall)
    t2 = time.monotonic()
    query_emb = embed_batch(encoder, [item["question"]])[0]
    top_k = max(max(ks), fetch_k)
    results = hip.recall(query_emb, top_k=top_k)
    recall_time = time.monotonic() - t2

    # deduplicate by session_id, preserving rank order
    ranked_sids = list(dict.fromkeys(r.metadata["session_id"] for r in results))

    # compute metrics: R@K is 1.0 when any answer session is in the top K
    metrics = {}
    for k in ks:
        hit = not answer_sids.isdisjoint(ranked_sids[:k])
        metrics[f"R@{k}"] = 1.0 if hit else 0.0

    metrics["NDCG@10"] = compute_ndcg(ranked_sids, answer_sids, k=10)
    metrics["embed_ms"] = embed_time * 1000
    metrics["store_ms"] = store_time * 1000
    metrics["recall_ms"] = recall_time * 1000
    metrics["n_chunks"] = len(chunks)
    return metrics


def compute_ndcg(ranked_sids, answer_sids, k=10):
    """Normalized Discounted Cumulative Gain with binary relevance.

    Args:
        ranked_sids: Session ids in rank order (best first).
        answer_sids: Collection of relevant session ids.
        k: Only the first ``k`` ranks are scored.

    Returns:
        DCG divided by the ideal DCG, or 0.0 when there are no relevant ids.
    """
    dcg = sum(
        1.0 / math.log2(i + 2)  # i+2 because rank starts at 1
        for i, sid in enumerate(ranked_sids[:k])
        if sid in answer_sids
    )
    # ideal: all answer sessions at top
    n_relevant = min(len(answer_sids), k)
    idcg = sum(1.0 / math.log2(i + 2) for i in range(n_relevant))
    return dcg / idcg if idcg > 0 else 0.0


# ── main ────────────────────────────────────────────────────────────

def _mean(rows, key):
    """Average ``row[key]`` over a non-empty list of metric dicts."""
    return sum(row[key] for row in rows) / len(rows)


def main():
    """Run the benchmark over the dataset file and print a summary report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default="benchmarks/longmemeval.json")
    parser.add_argument("--limit", type=int, default=0,
                        help="limit number of questions (0=all)")
    parser.add_argument("--granularity", choices=["session", "turn"],
                        default="turn")
    parser.add_argument("--fetch-k", type=int, default=0,
                        help="chunks to retrieve before session dedup "
                             "(0=max K)")
    args = parser.parse_args()

    print("LongMemEval benchmark for nocmem")
    print(f"granularity: {args.granularity}")
    print(f"device: {DEVICE}")
    print()

    # JSON is UTF-8 by specification; don't depend on the platform default.
    with open(args.data, encoding="utf-8") as f:
        data = json.load(f)

    # Only a positive limit truncates; a negative value would otherwise slice
    # off the tail of the dataset.
    if args.limit > 0:
        data = data[:args.limit]

    if not data:
        # Every average below divides by the question count.
        print("no questions to evaluate")
        return

    encoder = load_encoder()

    print(f"evaluating {len(data)} questions...\n")

    all_metrics = []
    by_type = {}

    for i, item in enumerate(data):
        metrics = evaluate_question(
            encoder, item, args.granularity, fetch_k=args.fetch_k)
        all_metrics.append(metrics)
        by_type.setdefault(item["question_type"], []).append(metrics)

        # progress: every 10 questions and after the last one
        if (i + 1) % 10 == 0 or i == len(data) - 1:
            r5 = _mean(all_metrics, "R@5") * 100
            r10 = _mean(all_metrics, "R@10") * 100
            avg_recall = _mean(all_metrics, "recall_ms")
            print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")

    # final results
    n = len(all_metrics)
    r5 = _mean(all_metrics, "R@5") * 100
    r10 = _mean(all_metrics, "R@10") * 100
    ndcg = _mean(all_metrics, "NDCG@10") * 100
    avg_embed = _mean(all_metrics, "embed_ms")
    avg_store = _mean(all_metrics, "store_ms")
    avg_recall = _mean(all_metrics, "recall_ms")
    avg_chunks = _mean(all_metrics, "n_chunks")

    print(f"\n{'='*60}")
    print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
    print(f"{'='*60}")
    print(f" Questions: {n}")
    print(f" Avg chunks: {avg_chunks:.0f}")
    print()
    print(f" R@5: {r5:.1f}%")
    print(f" R@10: {r10:.1f}%")
    print(f" NDCG@10: {ndcg:.1f}%")
    print()
    print(f" Avg embed: {avg_embed:.0f}ms")
    print(f" Avg store: {avg_store:.0f}ms")
    print(f" Avg recall: {avg_recall:.1f}ms")

    print("\n── by question type ──")
    for qtype, ms in sorted(by_type.items()):
        nt = len(ms)
        tr5 = _mean(ms, "R@5") * 100
        tr10 = _mean(ms, "R@10") * 100
        print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")

    print("\n── comparison ──")
    print(" MemPalace (raw, session): R@5=96.6%")
    print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")


if __name__ == "__main__":
    main()