Files
noc/mem/benchmarks/efficiency_bench.py
Fam Zheng 7000ccda0f add nocmem: auto memory recall + ingest via NuoNuo hippocampal network
- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
2026-04-11 12:24:48 +01:00

346 lines
12 KiB
Python

"""Efficiency benchmark for nocmem vs ChromaDB baseline.
Measures: storage size, memory usage, query latency, ingest throughput
at various scales (100, 1K, 5K, 10K, 20K memories).
Usage:
uv run python benchmarks/efficiency_bench.py
"""
import gc
import os
import json
import shutil
import tempfile
import time
import torch
import psutil
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "all-MiniLM-L6-v2"
EMBED_DIM = 384
DATA_FILE = "benchmarks/longmemeval.json"
# ── helpers ─────────────────────────────────────────────────────────
def get_process_mem_mb():
    """Return the resident set size (RSS) of this process, in MiB."""
    proc = psutil.Process(os.getpid())
    rss_bytes = proc.memory_info().rss
    return rss_bytes / (1024 ** 2)
def get_gpu_mem_mb():
    """Return currently allocated CUDA memory in MiB (0.0 on CPU).

    Queries ``torch.cuda.is_available()`` directly instead of the
    module-level ``DEVICE`` constant, making the helper self-contained.
    Behavior is unchanged: DEVICE == "cuda" exactly when CUDA is available.
    """
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.memory_allocated() / 1024**2
def file_size_mb(path):
    """Return the size of *path* in MiB, or 0.0 if it does not exist.

    Uses EAFP (single ``stat`` via ``getsize``) instead of
    ``exists()`` + ``getsize()``, which could race if the file is
    removed between the two calls.
    """
    try:
        return os.path.getsize(path) / 1024**2
    except OSError:
        return 0.0
def dir_size_mb(path):
    """Return the total size of all files under *path* (recursive), in MiB.

    Files that vanish or become unreadable mid-walk (possible while a
    database engine is still flushing its store) are skipped rather than
    aborting the whole benchmark run.
    """
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for name in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, name))
            except OSError:
                continue  # file disappeared / unreadable — best-effort sum
    return total / 1024**2
# ── extract chunks from LongMemEval ────────────────────────────────
def load_chunks(max_chunks=25000, data_file=None):
    """Extract deduplicated turn-level chunks from LongMemEval data.

    Each chunk is a "user\\nassistant" turn pair truncated to 1000 chars.
    Pairs are deduplicated by (session id, turn index) because the same
    haystack session can appear under multiple questions.

    Args:
        max_chunks: stop once this many chunks have been collected.
        data_file: path to the LongMemEval JSON; defaults to ``DATA_FILE``
            (backward-compatible — existing callers are unaffected).

    Returns:
        list[str] of at most ``max_chunks`` chunk texts.
    """
    path = data_file if data_file is not None else DATA_FILE
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    chunks = []
    seen = set()
    for item in data:
        for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
            # Sessions alternate user/assistant turns; step 2 visits each
            # user turn, and i + 1 is its assistant reply.
            for i in range(0, len(sess) - 1, 2):
                key = (sid, i)
                if key in seen:
                    continue
                seen.add(key)
                user = sess[i]["content"]
                asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
                text = f"{user}\n{asst}"[:1000]
                chunks.append(text)
                if len(chunks) >= max_chunks:
                    return chunks
    return chunks
# ── nocmem benchmark ────────────────────────────────────────────────
def bench_nocmem(encoder, chunks, n, query_texts):
    """Benchmark nocmem (HippocampalMemory) at scale n.

    Args:
        encoder: SentenceTransformer used for ingest and query embeddings.
        chunks: pool of memory texts; the first n are stored.
        n: number of memories to ingest.
        query_texts: query strings used to measure recall latency.

    Returns:
        dict with embed/store timing, ingest rate, disk/VRAM/RAM footprint,
        and avg/p50/p99 recall latency in ms.
    """
    torch.cuda.empty_cache()  # no-op on CPU; start from a clean allocator state
    gc.collect()
    subset = chunks[:n]
    gpu_before = get_gpu_mem_mb()
    ram_before = get_process_mem_mb()

    # batch embed
    t0 = time.monotonic()
    embeddings = encoder.encode(
        subset, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, batch_size=256, show_progress_bar=False,
    )
    embed_time = time.monotonic() - t0

    # store
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    t1 = time.monotonic()
    for i in range(n):
        hip.store(embeddings[i], embeddings[i], metadata={"id": i})
    store_time = time.monotonic() - t1
    gpu_after = get_gpu_mem_mb()
    ram_after = get_process_mem_mb()

    # Save to measure on-disk size. mkstemp instead of the deprecated,
    # race-prone tempfile.mktemp; try/finally so the temp file cannot
    # leak if hip.save() raises.
    fd, tmp = tempfile.mkstemp(suffix=".pt")
    os.close(fd)
    try:
        hip.save(tmp)
        disk_mb = file_size_mb(tmp)
    finally:
        os.unlink(tmp)

    # query latency — multiple queries, measure p50/p99
    query_embs = encoder.encode(
        query_texts, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, show_progress_bar=False,
    )
    latencies = []
    for qe in query_embs:
        t = time.monotonic()
        hip.recall(qe, top_k=5)
        latencies.append((time.monotonic() - t) * 1000)
    latencies.sort()
    p50 = latencies[len(latencies) // 2]
    # floor(0.99 * len) < len, so this index is always in range
    p99 = latencies[int(len(latencies) * 0.99)]
    avg = sum(latencies) / len(latencies)

    # cleanup so the next scale starts from a clean slate
    del hip, embeddings
    torch.cuda.empty_cache()
    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),  # memories/sec
        "disk_mb": disk_mb,
        "gpu_delta_mb": gpu_after - gpu_before,
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
# ── chromadb benchmark ──────────────────────────────────────────────
def bench_chromadb(encoder, chunks, n, query_texts):
    """Benchmark ChromaDB (MemPalace's backend) at scale n.

    Args:
        encoder: SentenceTransformer used for ingest and query embeddings.
        chunks: pool of memory texts; the first n are stored.
        n: number of documents to ingest.
        query_texts: query strings used to measure query latency.

    Returns:
        dict with the same schema as ``bench_nocmem`` (gpu_delta_mb is 0:
        ChromaDB itself is CPU/disk-resident here).
    """
    import chromadb

    subset = chunks[:n]
    ram_before = get_process_mem_mb()
    tmpdir = tempfile.mkdtemp()
    # try/finally so the on-disk store is removed even if the benchmark
    # raises partway through (the original leaked tmpdir on error).
    try:
        client = chromadb.PersistentClient(path=tmpdir)
        collection = client.create_collection(
            name="bench",
            metadata={"hnsw:space": "cosine"},
        )
        # embed
        t0 = time.monotonic()
        embeddings_np = encoder.encode(
            subset, normalize_embeddings=True,
            batch_size=256, show_progress_bar=False,
        )
        embed_time = time.monotonic() - t0

        # store — chromadb takes numpy/list; add in batches to stay under
        # its per-call limits
        t1 = time.monotonic()
        batch = 5000
        for start in range(0, n, batch):
            end = min(start + batch, n)
            collection.add(
                ids=[str(i) for i in range(start, end)],
                embeddings=embeddings_np[start:end].tolist(),
                documents=subset[start:end],
            )
        store_time = time.monotonic() - t1
        ram_after = get_process_mem_mb()
        disk_mb = dir_size_mb(tmpdir)

        # query latency
        query_np = encoder.encode(
            query_texts, normalize_embeddings=True, show_progress_bar=False,
        )
        latencies = []
        for qe in query_np:
            t = time.monotonic()
            collection.query(query_embeddings=[qe.tolist()], n_results=5)
            latencies.append((time.monotonic() - t) * 1000)
        latencies.sort()
        p50 = latencies[len(latencies) // 2]
        p99 = latencies[int(len(latencies) * 0.99)]
        avg = sum(latencies) / len(latencies)

        # drop client references and collect so sqlite handles are closed
        # before the store directory is removed
        del client, collection
        gc.collect()
    finally:
        # ignore_errors: on some platforms lingering handles can make
        # rmtree fail; a leftover temp dir must not mask benchmark results
        shutil.rmtree(tmpdir, ignore_errors=True)
    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),
        "disk_mb": disk_mb,
        "gpu_delta_mb": 0,
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
# ── main ────────────────────────────────────────────────────────────
def main() -> None:
    """Run the efficiency benchmark at increasing scales and print a report.

    Compares nocmem against ChromaDB when chromadb is importable;
    otherwise benchmarks nocmem alone.
    """
    print("nocmem efficiency benchmark")
    print(f"device: {DEVICE}")
    print()
    # check chromadb available
    has_chromadb = False
    try:
        import chromadb
        has_chromadb = True
        print("chromadb: available (will compare)")
    except ImportError:
        print("chromadb: not installed (nocmem only)")
    print()
    print("loading data...")
    chunks = load_chunks(25000)
    print(f" {len(chunks)} unique chunks extracted")
    print("loading encoder...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
    # query texts — mix of English and Chinese
    query_texts = [
        "What degree did I graduate with?",
        "How to deploy the application?",
        "What was the database error we fixed last week?",
        "Tell me about the meeting schedule",
        "What programming language should I learn?",
        "数据库密码在哪里",
        "部署到生产环境的步骤",
        "上次讨论的性能优化方案",
        "项目的技术栈是什么",
        "最近的待办事项有哪些",
        "How do I configure the server?",
        "What's the API endpoint for user authentication?",
        "Can you recommend some books on machine learning?",
        "What was the root cause of the production incident?",
        "How much memory does the GPU have?",
        "VR设备的兼容性问题",
        "模型推理的延迟是多少",
        "代码仓库的结构是怎样的",
        "如何解决内存泄漏",
        "上次会议的结论是什么",
    ]
    scales = [100, 500, 1000, 5000, 10000, 20000]
    # filter to what we have
    scales = [s for s in scales if s <= len(chunks)]
    nocmem_results = []
    chroma_results = []
    for n in scales:
        print(f"\n── scale: {n:,} memories ──")
        print(f" nocmem...", end="", flush=True)
        r = bench_nocmem(encoder, chunks, n, query_texts)
        nocmem_results.append(r)
        print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)")
        if has_chromadb:
            print(f" chromadb...", end="", flush=True)
            r2 = bench_chromadb(encoder, chunks, n, query_texts)
            chroma_results.append(r2)
            print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)")
    # ── report ──────────────────────────────────────────────────────
    print(f"\n{'='*80}")
    print(f"EFFICIENCY BENCHMARK RESULTS")
    print(f"{'='*80}")
    # table header: side-by-side columns when both systems were measured
    if has_chromadb:
        print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}")
        print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 100)
        for nm, cr in zip(nocmem_results, chroma_results):
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | "
                f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}"
            )
    else:
        print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 60)
        for nm in nocmem_results:
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}"
            )
    # summary: headline numbers at the largest scale measured
    if nocmem_results:
        biggest = nocmem_results[-1]
        print(f"\nnocmem @ {biggest['n']:,}:")
        print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
        print(f" Disk: {biggest['disk_mb']:.1f} MB")
        print(f" VRAM delta: {biggest['gpu_delta_mb']:.1f} MB")
        print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")
    if chroma_results:
        biggest = chroma_results[-1]
        print(f"\nChromaDB @ {biggest['n']:,}:")
        print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
        print(f" Disk: {biggest['disk_mb']:.1f} MB")
        print(f" RAM delta: {biggest['ram_delta_mb']:.1f} MB")
        print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")
    # head-to-head ratios (guarded against divide-by-zero with inf)
    if has_chromadb and nocmem_results and chroma_results:
        nm = nocmem_results[-1]
        cr = chroma_results[-1]
        print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──")
        lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf')
        disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf')
        rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf')
        # each line flips phrasing depending on which system won
        print(f" Latency: nocmem {lat_ratio:.1f}x faster" if lat_ratio > 1 else f" Latency: ChromaDB {1/lat_ratio:.1f}x faster")
        print(f" Disk: nocmem {disk_ratio:.1f}x smaller" if disk_ratio > 1 else f" Disk: ChromaDB {1/disk_ratio:.1f}x smaller")
        print(f" Ingest: nocmem {rate_ratio:.1f}x faster" if rate_ratio > 1 else f" Ingest: ChromaDB {1/rate_ratio:.1f}x faster")
if __name__ == "__main__":
    main()