- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints - NOC integration: auto recall after user message (injected as system msg), async ingest after LLM response (fire-and-forget) - Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32), top_k=3, KV-cache friendly (appended after user msg, not in system prompt) - Ingest: LLM extraction + paraphrase augmentation, heuristic fallback - Wired into main.rs, life.rs (agent done), http.rs (api chat) - Config: optional `nocmem.endpoint` in config.yaml - Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale - Design doc: doc/nocmem.md
240 lines
8.0 KiB
Python
"""LongMemEval benchmark for nocmem.
|
||
|
||
Evaluates retrieval quality: given a question, can nocmem find the correct
|
||
session(s) from a haystack of ~50 conversation sessions?
|
||
|
||
Uses HippocampalMemory directly (no HTTP) for speed.
|
||
Compares against MemPalace's 96.6% R@5 baseline.
|
||
|
||
Usage:
|
||
uv run python benchmarks/longmemeval_bench.py [--limit N] [--granularity session|turn]
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import sys
|
||
import time
|
||
|
||
import torch
|
||
from sentence_transformers import SentenceTransformer
|
||
|
||
from nuonuo.hippocampus import HippocampalMemory
|
||
|
||
# ── setup ───────────────────────────────────────────────────────────
|
||
|
||
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||
EMBED_MODEL = "all-MiniLM-L6-v2"
|
||
EMBED_DIM = 384
|
||
|
||
|
||
def load_encoder():
    """Load the sentence-embedding model onto the selected device."""
    print(f"loading {EMBED_MODEL} on {DEVICE}...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
    return encoder
|
||
|
||
|
||
def embed_batch(encoder, texts: list[str]) -> torch.Tensor:
    """Embed *texts* in one call; returns an (N, dim) normalized tensor."""
    options = dict(
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
        batch_size=128,
        show_progress_bar=False,
    )
    return encoder.encode(texts, **options)
|
||
|
||
|
||
# ── granularity: how to chunk sessions ──────────────────────────────
|
||
|
||
def sessions_to_chunks_turn(session_ids, sessions):
    """Each user-assistant turn becomes a separate chunk.

    Returns a list of (text, session_id) pairs. Turn text is capped at
    1000 characters so overly long turns don't upset the embedder.
    """
    chunks = []
    for sid, sess in zip(session_ids, sessions):
        total = len(sess)
        idx = 0
        # walk complete user/assistant pairs
        while idx + 1 < total:
            pair = f'{sess[idx]["content"]}\n{sess[idx + 1]["content"]}'
            chunks.append((pair[:1000], sid))
            idx += 2
        # a trailing unpaired message gets its own chunk
        if total % 2 == 1:
            chunks.append((sess[-1]["content"][:1000], sid))
    return chunks
|
||
|
||
|
||
def sessions_to_chunks_session(session_ids, sessions):
    """Each session becomes a single chunk (concatenated turns)."""
    # 2000-char cap keeps the chunk within the embedding model's context.
    return [
        ("\n".join(m["content"] for m in sess)[:2000], sid)
        for sid, sess in zip(session_ids, sessions)
    ]
|
||
|
||
|
||
# ── evaluate one question ───────────────────────────────────────────
|
||
|
||
def evaluate_question(encoder, item, granularity, ks=(5, 10)):
    """Store one question's haystack, query it, and score retrieval.

    Args:
        encoder: SentenceTransformer used to embed chunks and the query.
        item: one LongMemEval record with "haystack_session_ids",
            "haystack_sessions", "answer_session_ids", and "question".
        granularity: "turn" (per user-assistant pair) or "session"
            (one chunk per session).
        ks: cutoffs for the R@K metrics.

    Returns:
        dict with R@K for each k in *ks*, NDCG@10, per-phase timings
        in ms (embed/store/recall), and the chunk count.
    """
    # chunk the haystack at the requested granularity
    chunker = sessions_to_chunks_turn if granularity == "turn" else sessions_to_chunks_session
    chunks = chunker(item["haystack_session_ids"], item["haystack_sessions"])

    texts = [c[0] for c in chunks]
    sids = [c[1] for c in chunks]
    answer_sids = set(item["answer_session_ids"])

    # batch embed all chunks
    t0 = time.monotonic()
    embeddings = embed_batch(encoder, texts)
    embed_time = time.monotonic() - t0

    # build memory; each chunk is stored with its embedding as both key
    # and value (same tensor passed twice)
    t1 = time.monotonic()
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    for emb, sid in zip(embeddings, sids):
        hip.store(emb, emb, metadata={"session_id": sid})
    store_time = time.monotonic() - t1

    # embed the question and recall once at the largest requested K
    t2 = time.monotonic()
    query_emb = encoder.encode(
        [item["question"]], convert_to_tensor=True,
        normalize_embeddings=True, device=DEVICE,
    )[0]
    results = hip.recall(query_emb, top_k=max(ks))
    recall_time = time.monotonic() - t2

    # deduplicate by session_id, preserving rank order
    seen = set()
    ranked_sids = []
    for r in results:
        sid = r.metadata["session_id"]
        if sid not in seen:
            seen.add(sid)
            ranked_sids.append(sid)

    # R@K: did any answer session survive into the deduped top-K?
    metrics = {}
    for k in ks:
        hit = bool(answer_sids & set(ranked_sids[:k]))
        metrics[f"R@{k}"] = 1.0 if hit else 0.0

    # NDCG@10
    metrics["NDCG@10"] = compute_ndcg(ranked_sids[:10], answer_sids)

    metrics["embed_ms"] = embed_time * 1000
    metrics["store_ms"] = store_time * 1000
    metrics["recall_ms"] = recall_time * 1000
    metrics["n_chunks"] = len(chunks)

    return metrics
|
||
|
||
|
||
def compute_ndcg(ranked_sids, answer_sids, k=10):
    """Normalized Discounted Cumulative Gain at *k*."""
    # DCG: a relevant hit at 1-based rank r contributes 1 / log2(r + 1).
    dcg = sum(
        1.0 / math.log2(rank + 1)
        for rank, sid in enumerate(ranked_sids[:k], start=1)
        if sid in answer_sids
    )
    # IDCG: ideal ranking places every relevant session at the top.
    n_relevant = min(len(answer_sids), k)
    idcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, n_relevant + 1))
    return dcg / idcg if idcg > 0 else 0.0
|
||
|
||
|
||
# ── main ────────────────────────────────────────────────────────────
|
||
|
||
def main():
    """Run the LongMemEval benchmark and print aggregate metrics."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default="benchmarks/longmemeval.json")
    parser.add_argument("--limit", type=int, default=0, help="limit number of questions (0=all)")
    parser.add_argument("--granularity", choices=["session", "turn"], default="turn")
    args = parser.parse_args()

    print("LongMemEval benchmark for nocmem")
    print(f"granularity: {args.granularity}")
    print(f"device: {DEVICE}")
    print()

    with open(args.data) as f:
        data = json.load(f)

    if args.limit:
        data = data[:args.limit]

    # guard: the aggregate means below divide by the question count
    if not data:
        sys.exit("no questions to evaluate")

    encoder = load_encoder()

    print(f"evaluating {len(data)} questions...\n")

    all_metrics = []
    by_type = {}  # question_type -> list of per-question metric dicts

    for i, item in enumerate(data):
        metrics = evaluate_question(encoder, item, args.granularity)
        all_metrics.append(metrics)
        by_type.setdefault(item["question_type"], []).append(metrics)

        # progress report every 10 questions and after the last one
        if (i + 1) % 10 == 0 or i == len(data) - 1:
            r5 = sum(m["R@5"] for m in all_metrics) / len(all_metrics) * 100
            r10 = sum(m["R@10"] for m in all_metrics) / len(all_metrics) * 100
            avg_recall = sum(m["recall_ms"] for m in all_metrics) / len(all_metrics)
            print(f" [{i+1:3d}/{len(data)}] R@5={r5:.1f}% R@10={r10:.1f}% recall={avg_recall:.1f}ms")

    # final results: simple means over all questions
    n = len(all_metrics)
    r5 = sum(m["R@5"] for m in all_metrics) / n * 100
    r10 = sum(m["R@10"] for m in all_metrics) / n * 100
    ndcg = sum(m["NDCG@10"] for m in all_metrics) / n * 100
    avg_embed = sum(m["embed_ms"] for m in all_metrics) / n
    avg_store = sum(m["store_ms"] for m in all_metrics) / n
    avg_recall = sum(m["recall_ms"] for m in all_metrics) / n
    avg_chunks = sum(m["n_chunks"] for m in all_metrics) / n

    print(f"\n{'='*60}")
    print(f"nocmem LongMemEval Results ({args.granularity} granularity)")
    print(f"{'='*60}")
    print(f" Questions: {n}")
    print(f" Avg chunks: {avg_chunks:.0f}")
    print()
    print(f" R@5: {r5:.1f}%")
    print(f" R@10: {r10:.1f}%")
    print(f" NDCG@10: {ndcg:.1f}%")
    print()
    print(f" Avg embed: {avg_embed:.0f}ms")
    print(f" Avg store: {avg_store:.0f}ms")
    print(f" Avg recall: {avg_recall:.1f}ms")

    print(f"\n── by question type ──")
    for qtype, ms in sorted(by_type.items()):
        nt = len(ms)
        tr5 = sum(m["R@5"] for m in ms) / nt * 100
        tr10 = sum(m["R@10"] for m in ms) / nt * 100
        print(f" {qtype:30s} n={nt:3d} R@5={tr5:.1f}% R@10={tr10:.1f}%")

    print(f"\n── comparison ──")
    print(" MemPalace (raw, session): R@5=96.6%")
    print(f" nocmem ({args.granularity:7s}): R@5={r5:.1f}%")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|