Files
noc/mem/benchmarks/longmemeval_bench.py
Fam Zheng 7000ccda0f add nocmem: auto memory recall + ingest via NuoNuo hippocampal network
- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
2026-04-11 12:24:48 +01:00

240 lines
8.0 KiB
Python
Raw Blame History

"""LongMemEval benchmark for nocmem.
Evaluates retrieval quality: given a question, can nocmem find the correct
session(s) from a haystack of ~50 conversation sessions?
Uses HippocampalMemory directly (no HTTP) for speed.
Compares against MemPalace's 96.6% R@5 baseline.
Usage:
uv run python benchmarks/longmemeval_bench.py [--limit N] [--granularity session|turn]
"""
import argparse
import json
import math
import sys
import time
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
# ── setup ───────────────────────────────────────────────────────────
# Use the GPU when available; embeddings and memory ops run on this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Sentence-transformer checkpoint used for both chunk and query embeddings.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Output dimensionality of EMBED_MODEL; must match HippocampalMemory's embed_dim.
EMBED_DIM = 384
def load_encoder():
    """Load and return the SentenceTransformer embedding model on DEVICE."""
    print(f"loading {EMBED_MODEL} on {DEVICE}...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
    return encoder
def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
    """Embed *texts* in one batched call; returns an (N, dim) tensor.

    Embeddings are L2-normalized so cosine similarity reduces to a dot
    product downstream.
    """
    encode_options = dict(
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
        batch_size=128,
        show_progress_bar=False,
    )
    return encoder.encode(texts, **encode_options)
# ── granularity: how to chunk sessions ──────────────────────────────
def sessions_to_chunks_turn(session_ids, sessions):
    """Split each session into per-turn chunks.

    A chunk is one user message joined (by a newline) with the assistant
    reply that follows it, capped at 1000 characters to avoid embedding
    issues.  A trailing unpaired message becomes a chunk of its own.
    Returns a list of (text, session_id) tuples.
    """
    chunks = []
    for sid, messages in zip(session_ids, sessions):
        total = len(messages)
        # walk user/assistant pairs: indices 0-1, 2-3, ...
        for start in range(0, total - 1, 2):
            user_text = messages[start]["content"]
            reply = messages[start + 1]["content"] if start + 1 < total else ""
            turn = f"{user_text}\n{reply}"
            chunks.append((turn[:1000], sid))
        # an odd message count leaves a final reply-less message
        if total % 2:
            chunks.append((messages[-1]["content"][:1000], sid))
    return chunks
def sessions_to_chunks_session(session_ids, sessions):
    """Collapse each session into a single chunk of newline-joined messages.

    Text is capped at 2000 characters so it fits the embedding model's
    context.  Returns a list of (text, session_id) tuples.
    """
    joined = ("\n".join(msg["content"] for msg in sess) for sess in sessions)
    return [(text[:2000], sid) for sid, text in zip(session_ids, joined)]
# ── evaluate one question ───────────────────────────────────────────
def evaluate_question(encoder, item, granularity, ks=(5, 10)):
    """Run one LongMemEval question end-to-end and score retrieval.

    Embeds the question's haystack sessions (chunked per *granularity*),
    stores them in a fresh HippocampalMemory, queries with the question,
    and checks whether an answer session lands in the top-K results.

    Args:
        encoder: SentenceTransformer used for chunk and query embeddings.
        item: LongMemEval record with keys "haystack_session_ids",
            "haystack_sessions", "answer_session_ids", and "question".
        granularity: "turn" (chunk per user/assistant turn) or
            "session" (chunk per whole session).
        ks: cutoffs for the R@K hit metrics.

    Returns:
        dict with R@K for each k in *ks*, "NDCG@10" (fixed at 10,
        independent of *ks*), per-phase timings in ms, and "n_chunks".
    """
    # chunk the haystack at the requested granularity
    chunker = (sessions_to_chunks_turn if granularity == "turn"
               else sessions_to_chunks_session)
    chunks = chunker(item["haystack_session_ids"], item["haystack_sessions"])
    texts = [text for text, _ in chunks]
    sids = [sid for _, sid in chunks]
    answer_sids = set(item["answer_session_ids"])

    # batch embed all chunks
    t0 = time.monotonic()
    embeddings = embed_batch(encoder, texts)
    embed_time = time.monotonic() - t0

    # build a fresh memory; each chunk is stored with key == value embedding
    t1 = time.monotonic()
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    for emb, sid in zip(embeddings, sids):
        hip.store(emb, emb, metadata={"session_id": sid})
    store_time = time.monotonic() - t1

    # query — embed via embed_batch so the query uses exactly the same
    # encode settings as the chunks (the original inlined a second
    # encoder.encode call with slightly different arguments)
    t2 = time.monotonic()
    query_emb = embed_batch(encoder, [item["question"]])[0]
    max_k = max(ks)
    results = hip.recall(query_emb, top_k=max_k)
    recall_time = time.monotonic() - t2

    # deduplicate by session_id, preserving rank order
    seen = set()
    ranked_sids = []
    for r in results:
        sid = r.metadata["session_id"]
        if sid not in seen:
            seen.add(sid)
            ranked_sids.append(sid)

    # hit metrics: did any answer session make the top k?
    metrics = {}
    for k in ks:
        hit = bool(answer_sids & set(ranked_sids[:k]))
        metrics[f"R@{k}"] = 1.0 if hit else 0.0
    # NDCG@10 — always at cutoff 10 regardless of ks
    metrics["NDCG@10"] = compute_ndcg(ranked_sids[:10], answer_sids)
    metrics["embed_ms"] = embed_time * 1000
    metrics["store_ms"] = store_time * 1000
    metrics["recall_ms"] = recall_time * 1000
    metrics["n_chunks"] = len(chunks)
    return metrics
def compute_ndcg(ranked_sids, answer_sids, k=10):
    """Normalized Discounted Cumulative Gain at rank *k*.

    Binary relevance: a ranked id counts iff it appears in *answer_sids*.
    Returns 0.0 when there are no relevant ids (IDCG would be zero).
    """
    # rank r (1-based) contributes 1/log2(r + 1); enumerate is 0-based,
    # hence pos + 2 inside the log
    gains = [
        1.0 / math.log2(pos + 2)
        for pos, sid in enumerate(ranked_sids[:k])
        if sid in answer_sids
    ]
    dcg = sum(gains)
    # ideal ranking places every relevant session at the top
    ideal_hits = min(len(answer_sids), k)
    idcg = sum(1.0 / math.log2(pos + 2) for pos in range(ideal_hits))
    if idcg == 0:
        return 0.0
    return dcg / idcg
# ── main ────────────────────────────────────────────────────────────
def main():
    """CLI entry point: run the LongMemEval benchmark over a JSON dataset.

    Prints running averages every 10 questions, then aggregate metrics,
    a per-question-type breakdown, and a comparison against the
    MemPalace baseline.
    """
    parser = argparse.ArgumentParser()
    # dataset: a JSON list of LongMemEval question records
    parser.add_argument("--data", default="benchmarks/longmemeval.json")
    parser.add_argument("--limit", type=int, default=0, help="limit number of questions (0=all)")
    parser.add_argument("--granularity", choices=["session", "turn"], default="turn")
    args = parser.parse_args()
    print(f"LongMemEval benchmark for nocmem")
    print(f"granularity: {args.granularity}")
    print(f"device: {DEVICE}")
    print()
    with open(args.data) as f:
        data = json.load(f)
    if args.limit:
        data = data[:args.limit]
    encoder = load_encoder()
    print(f"evaluating {len(data)} questions...\n")
    all_metrics = []
    by_type = {}  # question_type -> list of per-question metric dicts
    for i, item in enumerate(data):
        metrics = evaluate_question(encoder, item, args.granularity)
        all_metrics.append(metrics)
        qtype = item["question_type"]
        if qtype not in by_type:
            by_type[qtype] = []
        by_type[qtype].append(metrics)
        # progress: running averages every 10 questions and at the end
        if (i + 1) % 10 == 0 or i == len(data) - 1:
            r5 = sum(m["R@5"] for m in all_metrics) / len(all_metrics) * 100
            r10 = sum(m["R@10"] for m in all_metrics) / len(all_metrics) * 100
            avg_recall = sum(m["recall_ms"] for m in all_metrics) / len(all_metrics)
            print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")
    # final results: averages over all evaluated questions
    n = len(all_metrics)
    r5 = sum(m["R@5"] for m in all_metrics) / n * 100
    r10 = sum(m["R@10"] for m in all_metrics) / n * 100
    ndcg = sum(m["NDCG@10"] for m in all_metrics) / n * 100
    avg_embed = sum(m["embed_ms"] for m in all_metrics) / n
    avg_store = sum(m["store_ms"] for m in all_metrics) / n
    avg_recall = sum(m["recall_ms"] for m in all_metrics) / n
    avg_chunks = sum(m["n_chunks"] for m in all_metrics) / n
    print(f"\n{'='*60}")
    print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
    print(f"{'='*60}")
    print(f" Questions: {n}")
    print(f" Avg chunks: {avg_chunks:.0f}")
    print(f"")
    print(f" R@5: {r5:.1f}%")
    print(f" R@10: {r10:.1f}%")
    print(f" NDCG@10: {ndcg:.1f}%")
    print(f"")
    print(f" Avg embed: {avg_embed:.0f}ms")
    print(f" Avg store: {avg_store:.0f}ms")
    print(f" Avg recall: {avg_recall:.1f}ms")
    # breakdown so weak question types (e.g. temporal) are visible
    print(f"\n── by question type ──")
    for qtype, ms in sorted(by_type.items()):
        nt = len(ms)
        tr5 = sum(m["R@5"] for m in ms) / nt * 100
        tr10 = sum(m["R@10"] for m in ms) / nt * 100
        print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")
    print(f"\n── comparison ──")
    print(f" MemPalace (raw, session): R@5=96.6%")
    print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")
# script entry point
if __name__ == "__main__":
    main()