"""Efficiency benchmark for nocmem vs ChromaDB baseline. Measures: storage size, memory usage, query latency, ingest throughput at various scales (100, 1K, 5K, 10K, 20K memories). Usage: uv run python benchmarks/efficiency_bench.py """ import gc import os import json import shutil import tempfile import time import torch import psutil from sentence_transformers import SentenceTransformer from nuonuo.hippocampus import HippocampalMemory DEVICE = "cuda" if torch.cuda.is_available() else "cpu" EMBED_MODEL = "all-MiniLM-L6-v2" EMBED_DIM = 384 DATA_FILE = "benchmarks/longmemeval.json" # ── helpers ───────────────────────────────────────────────────────── def get_process_mem_mb(): return psutil.Process(os.getpid()).memory_info().rss / 1024**2 def get_gpu_mem_mb(): if DEVICE != "cuda": return 0.0 return torch.cuda.memory_allocated() / 1024**2 def file_size_mb(path): if os.path.exists(path): return os.path.getsize(path) / 1024**2 return 0.0 def dir_size_mb(path): total = 0 for dirpath, _, filenames in os.walk(path): for f in filenames: total += os.path.getsize(os.path.join(dirpath, f)) return total / 1024**2 # ── extract chunks from LongMemEval ──────────────────────────────── def load_chunks(max_chunks=25000): """Extract turn-level chunks from LongMemEval data.""" with open(DATA_FILE) as f: data = json.load(f) chunks = [] seen = set() for item in data: for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]): for i in range(0, len(sess) - 1, 2): key = (sid, i) if key in seen: continue seen.add(key) user = sess[i]["content"] asst = sess[i + 1]["content"] if i + 1 < len(sess) else "" text = f"{user}\n{asst}"[:1000] chunks.append(text) if len(chunks) >= max_chunks: return chunks return chunks # ── nocmem benchmark ──────────────────────────────────────────────── def bench_nocmem(encoder, chunks, n, query_texts): """Benchmark nocmem at scale n.""" torch.cuda.empty_cache() gc.collect() subset = chunks[:n] gpu_before = get_gpu_mem_mb() ram_before = get_process_mem_mb() # batch embed t0 = time.monotonic() embeddings = encoder.encode( subset, convert_to_tensor=True, normalize_embeddings=True, device=DEVICE, batch_size=256, show_progress_bar=False, ) embed_time = time.monotonic() - t0 # store hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE) t1 = time.monotonic() for i in range(n): hip.store(embeddings[i], embeddings[i], metadata={"id": i}) store_time = time.monotonic() - t1 gpu_after = get_gpu_mem_mb() ram_after = get_process_mem_mb() # save to measure file size tmp = tempfile.mktemp(suffix=".pt") hip.save(tmp) disk_mb = file_size_mb(tmp) os.unlink(tmp) # query latency — multiple queries, measure p50/p99 query_embs = encoder.encode( query_texts, convert_to_tensor=True, normalize_embeddings=True, device=DEVICE, show_progress_bar=False, ) latencies = [] for qe in query_embs: t = time.monotonic() hip.recall(qe, top_k=5) latencies.append((time.monotonic() - t) * 1000) latencies.sort() p50 = latencies[len(latencies) // 2] p99 = latencies[int(len(latencies) * 0.99)] avg = sum(latencies) / len(latencies) # cleanup del hip, embeddings torch.cuda.empty_cache() return { "n": n, "embed_time_s": embed_time, "store_time_s": store_time, "ingest_rate": n / (embed_time + store_time), # memories/sec "disk_mb": disk_mb, "gpu_delta_mb": gpu_after - gpu_before, "ram_delta_mb": ram_after - ram_before, "latency_avg_ms": avg, "latency_p50_ms": p50, "latency_p99_ms": p99, } # ── chromadb benchmark ────────────────────────────────────────────── def bench_chromadb(encoder, chunks, n, query_texts): """Benchmark ChromaDB (MemPalace's backend) at scale n.""" import chromadb subset = chunks[:n] ram_before = get_process_mem_mb() tmpdir = tempfile.mkdtemp() client = chromadb.PersistentClient(path=tmpdir) collection = client.create_collection( name="bench", metadata={"hnsw:space": "cosine"}, ) # embed t0 = time.monotonic() embeddings_np = encoder.encode( subset, normalize_embeddings=True, batch_size=256, show_progress_bar=False, ) embed_time = time.monotonic() - t0 # store — chromadb takes numpy/list t1 = time.monotonic() batch = 5000 for start in range(0, n, batch): end = min(start + batch, n) collection.add( ids=[str(i) for i in range(start, end)], embeddings=embeddings_np[start:end].tolist(), documents=subset[start:end], ) store_time = time.monotonic() - t1 ram_after = get_process_mem_mb() disk_mb = dir_size_mb(tmpdir) # query latency query_np = encoder.encode( query_texts, normalize_embeddings=True, show_progress_bar=False, ) latencies = [] for qe in query_np: t = time.monotonic() collection.query(query_embeddings=[qe.tolist()], n_results=5) latencies.append((time.monotonic() - t) * 1000) latencies.sort() p50 = latencies[len(latencies) // 2] p99 = latencies[int(len(latencies) * 0.99)] avg = sum(latencies) / len(latencies) # cleanup del client, collection shutil.rmtree(tmpdir) return { "n": n, "embed_time_s": embed_time, "store_time_s": store_time, "ingest_rate": n / (embed_time + store_time), "disk_mb": disk_mb, "gpu_delta_mb": 0, "ram_delta_mb": ram_after - ram_before, "latency_avg_ms": avg, "latency_p50_ms": p50, "latency_p99_ms": p99, } # ── main ──────────────────────────────────────────────────────────── def main(): print("nocmem efficiency benchmark") print(f"device: {DEVICE}") print() # check chromadb available has_chromadb = False try: import chromadb has_chromadb = True print("chromadb: available (will compare)") except ImportError: print("chromadb: not installed (nocmem only)") print() print("loading data...") chunks = load_chunks(25000) print(f" {len(chunks)} unique chunks extracted") print("loading encoder...") encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE) # query texts — mix of English and Chinese query_texts = [ "What degree did I graduate with?", "How to deploy the application?", "What was the database error we fixed last week?", "Tell me about the meeting schedule", "What programming language should I learn?", "数据库密码在哪里", "部署到生产环境的步骤", "上次讨论的性能优化方案", "项目的技术栈是什么", "最近的待办事项有哪些", "How do I configure the server?", "What's the API endpoint for user authentication?", "Can you recommend some books on machine learning?", "What was the root cause of the production incident?", "How much memory does the GPU have?", "VR设备的兼容性问题", "模型推理的延迟是多少", "代码仓库的结构是怎样的", "如何解决内存泄漏", "上次会议的结论是什么", ] scales = [100, 500, 1000, 5000, 10000, 20000] # filter to what we have scales = [s for s in scales if s <= len(chunks)] nocmem_results = [] chroma_results = [] for n in scales: print(f"\n── scale: {n:,} memories ──") print(f" nocmem...", end="", flush=True) r = bench_nocmem(encoder, chunks, n, query_texts) nocmem_results.append(r) print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)") if has_chromadb: print(f" chromadb...", end="", flush=True) r2 = bench_chromadb(encoder, chunks, n, query_texts) chroma_results.append(r2) print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)") # ── report ────────────────────────────────────────────────────── print(f"\n{'='*80}") print(f"EFFICIENCY BENCHMARK RESULTS") print(f"{'='*80}") # table header if has_chromadb: print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}") print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}") print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}") print("-" * 100) for nm, cr in zip(nocmem_results, chroma_results): print( f"{nm['n']:>8,} | " f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | " f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}" ) else: print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}") print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}") print("-" * 60) for nm in nocmem_results: print( f"{nm['n']:>8,} | " f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}" ) # summary if nocmem_results: biggest = nocmem_results[-1] print(f"\nnocmem @ {biggest['n']:,}:") print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms") print(f" Disk: {biggest['disk_mb']:.1f} MB") print(f" VRAM delta: {biggest['gpu_delta_mb']:.1f} MB") print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec") if chroma_results: biggest = chroma_results[-1] print(f"\nChromaDB @ {biggest['n']:,}:") print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms") print(f" Disk: {biggest['disk_mb']:.1f} MB") print(f" RAM delta: {biggest['ram_delta_mb']:.1f} MB") print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec") if has_chromadb and nocmem_results and chroma_results: nm = nocmem_results[-1] cr = chroma_results[-1] print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──") lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf') disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf') rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf') print(f" Latency: nocmem {lat_ratio:.1f}x faster" if lat_ratio > 1 else f" Latency: ChromaDB {1/lat_ratio:.1f}x faster") print(f" Disk: nocmem {disk_ratio:.1f}x smaller" if disk_ratio > 1 else f" Disk: ChromaDB {1/disk_ratio:.1f}x smaller") print(f" Ingest: nocmem {rate_ratio:.1f}x faster" if rate_ratio > 1 else f" Ingest: ChromaDB {1/rate_ratio:.1f}x faster") if __name__ == "__main__": main()