# nocmem project notes (from the change description):
# - nocmem Python service (mem/): FastAPI wrapper around NuoNuo's Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
# - NOC integration: auto recall after user message (injected as system msg), async ingest after LLM response (fire-and-forget)
# - Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32), top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
# - Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
# - Wired into main.rs, life.rs (agent done), http.rs (api chat)
# - Config: optional `nocmem.endpoint` in config.yaml
# - Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
# - Design doc: doc/nocmem.md
# (viewer metadata: 346 lines, 12 KiB, Python)
"""Efficiency benchmark for nocmem vs ChromaDB baseline.
|
|
|
|
Measures: storage size, memory usage, query latency, ingest throughput
|
|
at various scales (100, 1K, 5K, 10K, 20K memories).
|
|
|
|
Usage:
|
|
uv run python benchmarks/efficiency_bench.py
|
|
"""
|
|
|
|
import gc
|
|
import os
|
|
import json
|
|
import shutil
|
|
import tempfile
|
|
import time
|
|
|
|
import torch
|
|
import psutil
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
from nuonuo.hippocampus import HippocampalMemory
|
|
|
|
# Run embeddings and the memory store on CUDA when available, else CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# SentenceTransformer model used for all embeddings in this benchmark.
EMBED_MODEL = "all-MiniLM-L6-v2"

# Output dimensionality of EMBED_MODEL's embeddings.
EMBED_DIM = 384

# LongMemEval dataset used as the source of memory chunks
# (path is relative to the repo root — run from there).
DATA_FILE = "benchmarks/longmemeval.json"
|
|
|
|
# ── helpers ─────────────────────────────────────────────────────────
|
|
|
|
def get_process_mem_mb():
    """Resident set size (RSS) of the current process, in MiB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)
|
|
|
|
def get_gpu_mem_mb():
    """Currently allocated CUDA memory in MiB; 0.0 when running on CPU."""
    if DEVICE == "cuda":
        return torch.cuda.memory_allocated() / (1024 * 1024)
    return 0.0
|
|
|
|
def file_size_mb(path):
    """Size of the file at *path* in MiB; 0.0 if the path does not exist."""
    if not os.path.exists(path):
        return 0.0
    return os.path.getsize(path) / (1024 * 1024)
|
|
|
|
def dir_size_mb(path):
    """Total size of every file under *path* (recursive), in MiB."""
    total_bytes = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, names in os.walk(path)
        for name in names
    )
    return total_bytes / (1024 * 1024)
|
|
|
|
|
|
# ── extract chunks from LongMemEval ────────────────────────────────
|
|
|
|
def load_chunks(max_chunks=25000, data_file=None):
    """Extract turn-level chunks from LongMemEval data.

    Each chunk is one (user, assistant) exchange joined with a newline and
    truncated to 1000 characters. Sessions shared across questions (same
    session id) are extracted only once.

    Args:
        max_chunks: stop as soon as this many chunks are collected.
        data_file: path to the LongMemEval JSON file; defaults to DATA_FILE.

    Returns:
        List of chunk strings, at most *max_chunks* long.
    """
    with open(data_file or DATA_FILE) as f:
        data = json.load(f)

    chunks = []
    seen = set()
    for item in data:
        for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
            # Step over (user, assistant) pairs. Bug fix: the original used
            # range(0, len(sess) - 1, 2), which made the `else ""` fallback
            # below unreachable and silently dropped a trailing unanswered
            # user turn in odd-length sessions.
            for i in range(0, len(sess), 2):
                key = (sid, i)
                if key in seen:
                    continue
                seen.add(key)
                user = sess[i]["content"]
                asst = sess[i + 1]["content"] if i + 1 < len(sess) else ""
                text = f"{user}\n{asst}"[:1000]
                chunks.append(text)
                if len(chunks) >= max_chunks:
                    return chunks
    return chunks
|
|
|
|
|
|
# ── nocmem benchmark ────────────────────────────────────────────────
|
|
|
|
def bench_nocmem(encoder, chunks, n, query_texts):
    """Benchmark nocmem (HippocampalMemory) at scale n.

    Args:
        encoder: SentenceTransformer used for all embeddings.
        chunks: pool of text chunks; the first n are ingested.
        n: number of memories to store.
        query_texts: texts used to measure recall latency.

    Returns:
        Dict with ingest timing/rate, serialized size on disk, GPU/RAM
        deltas (MB), and recall latency stats (avg/p50/p99 in ms).
    """
    # Start from a clean slate so memory deltas are attributable to this run.
    # (empty_cache is a no-op when CUDA is uninitialized.)
    torch.cuda.empty_cache()
    gc.collect()

    subset = chunks[:n]
    gpu_before = get_gpu_mem_mb()
    ram_before = get_process_mem_mb()

    # batch embed
    t0 = time.monotonic()
    embeddings = encoder.encode(
        subset, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, batch_size=256, show_progress_bar=False,
    )
    embed_time = time.monotonic() - t0

    # store: one memory per chunk, key == value embedding
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    t1 = time.monotonic()
    for i in range(n):
        hip.store(embeddings[i], embeddings[i], metadata={"id": i})
    store_time = time.monotonic() - t1

    gpu_after = get_gpu_mem_mb()
    ram_after = get_process_mem_mb()

    # save to measure file size — mkstemp instead of the deprecated,
    # race-prone tempfile.mktemp; try/finally guarantees cleanup
    fd, tmp = tempfile.mkstemp(suffix=".pt")
    os.close(fd)
    try:
        hip.save(tmp)
        disk_mb = file_size_mb(tmp)
    finally:
        os.unlink(tmp)

    # query latency — multiple queries, measure p50/p99
    query_embs = encoder.encode(
        query_texts, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, show_progress_bar=False,
    )
    latencies = []
    for qe in query_embs:
        t = time.monotonic()
        hip.recall(qe, top_k=5)
        latencies.append((time.monotonic() - t) * 1000)

    latencies.sort()
    p50 = latencies[len(latencies) // 2]
    # NOTE(review): with few queries this index is just the max sample.
    p99 = latencies[int(len(latencies) * 0.99)]
    avg = sum(latencies) / len(latencies)

    # cleanup so the next scale's deltas start from a clean baseline
    del hip, embeddings
    torch.cuda.empty_cache()

    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),  # memories/sec
        "disk_mb": disk_mb,
        "gpu_delta_mb": gpu_after - gpu_before,
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
|
|
|
|
|
|
# ── chromadb benchmark ──────────────────────────────────────────────
|
|
|
|
def bench_chromadb(encoder, chunks, n, query_texts):
    """Benchmark ChromaDB (MemPalace's backend) at scale n.

    Mirrors bench_nocmem: embed the first n chunks, bulk-insert them into a
    persistent collection, measure disk/RAM footprint, then time top-5
    queries and report avg/p50/p99 latency in milliseconds.
    """
    import chromadb

    docs = chunks[:n]
    rss_start = get_process_mem_mb()

    workdir = tempfile.mkdtemp()
    client = chromadb.PersistentClient(path=workdir)
    collection = client.create_collection(
        name="bench",
        metadata={"hnsw:space": "cosine"},
    )

    # Embed everything up front (numpy output — chromadb wants plain lists).
    embed_start = time.monotonic()
    vectors = encoder.encode(
        docs, normalize_embeddings=True,
        batch_size=256, show_progress_bar=False,
    )
    embed_time = time.monotonic() - embed_start

    # Insert in fixed-size batches to keep individual add() calls bounded.
    insert_start = time.monotonic()
    batch_size = 5000
    for lo in range(0, n, batch_size):
        hi = min(lo + batch_size, n)
        collection.add(
            ids=[str(i) for i in range(lo, hi)],
            embeddings=vectors[lo:hi].tolist(),
            documents=docs[lo:hi],
        )
    store_time = time.monotonic() - insert_start

    rss_end = get_process_mem_mb()
    disk_mb = dir_size_mb(workdir)

    # Per-query latency over the benchmark query set.
    query_vectors = encoder.encode(
        query_texts, normalize_embeddings=True, show_progress_bar=False,
    )
    timings = []
    for vec in query_vectors:
        started = time.monotonic()
        collection.query(query_embeddings=[vec.tolist()], n_results=5)
        timings.append((time.monotonic() - started) * 1000)

    timings.sort()

    result = {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),
        "disk_mb": disk_mb,
        "gpu_delta_mb": 0,  # chromadb path is CPU-only here
        "ram_delta_mb": rss_end - rss_start,
        "latency_avg_ms": sum(timings) / len(timings),
        "latency_p50_ms": timings[len(timings) // 2],
        "latency_p99_ms": timings[int(len(timings) * 0.99)],
    }

    # Drop client handles before deleting the on-disk store.
    del client, collection
    shutil.rmtree(workdir)

    return result
|
|
|
|
|
|
# ── main ────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Run the efficiency benchmark at increasing scales and print a report.

    Compares nocmem against ChromaDB when chromadb is importable; otherwise
    benchmarks nocmem alone. Results are printed as aligned tables plus a
    summary and head-to-head ratios at the largest scale.
    """
    print("nocmem efficiency benchmark")
    print(f"device: {DEVICE}")
    print()

    # check chromadb available — the comparison columns are optional
    has_chromadb = False
    try:
        import chromadb
        has_chromadb = True
        print("chromadb: available (will compare)")
    except ImportError:
        print("chromadb: not installed (nocmem only)")
    print()

    print("loading data...")
    chunks = load_chunks(25000)
    print(f" {len(chunks)} unique chunks extracted")

    print("loading encoder...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

    # query texts — mix of English and Chinese
    query_texts = [
        "What degree did I graduate with?",
        "How to deploy the application?",
        "What was the database error we fixed last week?",
        "Tell me about the meeting schedule",
        "What programming language should I learn?",
        "数据库密码在哪里",
        "部署到生产环境的步骤",
        "上次讨论的性能优化方案",
        "项目的技术栈是什么",
        "最近的待办事项有哪些",
        "How do I configure the server?",
        "What's the API endpoint for user authentication?",
        "Can you recommend some books on machine learning?",
        "What was the root cause of the production incident?",
        "How much memory does the GPU have?",
        "VR设备的兼容性问题",
        "模型推理的延迟是多少",
        "代码仓库的结构是怎样的",
        "如何解决内存泄漏",
        "上次会议的结论是什么",
    ]

    scales = [100, 500, 1000, 5000, 10000, 20000]
    # filter to what we have
    scales = [s for s in scales if s <= len(chunks)]

    nocmem_results = []
    chroma_results = []

    # Run both systems at each scale so rows in the report line up.
    for n in scales:
        print(f"\n── scale: {n:,} memories ──")

        print(f" nocmem...", end="", flush=True)
        r = bench_nocmem(encoder, chunks, n, query_texts)
        nocmem_results.append(r)
        print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)")

        if has_chromadb:
            print(f" chromadb...", end="", flush=True)
            r2 = bench_chromadb(encoder, chunks, n, query_texts)
            chroma_results.append(r2)
            print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)")

    # ── report ──────────────────────────────────────────────────────

    print(f"\n{'='*80}")
    print(f"EFFICIENCY BENCHMARK RESULTS")
    print(f"{'='*80}")

    # table header — side-by-side columns when chromadb results exist
    if has_chromadb:
        print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}")
        print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 100)
        # Paired rows: same index in both lists is the same scale n.
        for nm, cr in zip(nocmem_results, chroma_results):
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | "
                f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}"
            )
    else:
        # nocmem-only table
        print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 60)
        for nm in nocmem_results:
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}"
            )

    # summary — headline numbers at the largest scale reached
    if nocmem_results:
        biggest = nocmem_results[-1]
        print(f"\nnocmem @ {biggest['n']:,}:")
        print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
        print(f" Disk: {biggest['disk_mb']:.1f} MB")
        print(f" VRAM delta: {biggest['gpu_delta_mb']:.1f} MB")
        print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")

    if chroma_results:
        biggest = chroma_results[-1]
        print(f"\nChromaDB @ {biggest['n']:,}:")
        print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
        print(f" Disk: {biggest['disk_mb']:.1f} MB")
        print(f" RAM delta: {biggest['ram_delta_mb']:.1f} MB")
        print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")

    # head-to-head ratios, guarded against divide-by-zero denominators
    # NOTE(review): the 1/ratio branches can still divide by zero if a
    # ratio is exactly 0 — only possible with a zero measurement.
    if has_chromadb and nocmem_results and chroma_results:
        nm = nocmem_results[-1]
        cr = chroma_results[-1]
        print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──")
        lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf')
        disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf')
        rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf')
        print(f" Latency: nocmem {lat_ratio:.1f}x faster" if lat_ratio > 1 else f" Latency: ChromaDB {1/lat_ratio:.1f}x faster")
        print(f" Disk: nocmem {disk_ratio:.1f}x smaller" if disk_ratio > 1 else f" Disk: ChromaDB {1/disk_ratio:.1f}x smaller")
        print(f" Ingest: nocmem {rate_ratio:.1f}x faster" if rate_ratio > 1 else f" Ingest: ChromaDB {1/rate_ratio:.1f}x faster")


if __name__ == "__main__":
    main()
|