add nocmem: auto memory recall + ingest via NuoNuo hippocampal network
- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's Hopfield-Hebbian memory, with /recall, /ingest, /store, /stats endpoints
- NOC integration: auto recall after user message (injected as system msg), async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32), top_k=3, KV-cache friendly (appended after user msg, not in system prompt)
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
This commit is contained in:
345
mem/benchmarks/efficiency_bench.py
Normal file
345
mem/benchmarks/efficiency_bench.py
Normal file
@@ -0,0 +1,345 @@
|
||||
"""Efficiency benchmark for nocmem vs ChromaDB baseline.
|
||||
|
||||
Measures: storage size, memory usage, query latency, ingest throughput
|
||||
at various scales (100, 1K, 5K, 10K, 20K memories).
|
||||
|
||||
Usage:
|
||||
uv run python benchmarks/efficiency_bench.py
|
||||
"""
|
||||
|
||||
import gc
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import torch
|
||||
import psutil
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from nuonuo.hippocampus import HippocampalMemory
|
||||
|
||||
# Run embedding and memory operations on GPU when one is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# SentenceTransformer checkpoint used for all embeddings in this benchmark.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Output dimension of all-MiniLM-L6-v2 embeddings.
EMBED_DIM = 384

# LongMemEval JSON dump used as the chunk corpus.
# NOTE(review): relative path — presumably resolved from the mem/ directory
# (see the `uv run python benchmarks/...` usage line); confirm working dir.
DATA_FILE = "benchmarks/longmemeval.json"
|
||||
|
||||
# ── helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def get_process_mem_mb():
    """Resident set size (RSS) of the current process, in MiB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)
|
||||
|
||||
def get_gpu_mem_mb():
    """Currently allocated CUDA memory in MiB; 0.0 when running on CPU."""
    if DEVICE == "cuda":
        return torch.cuda.memory_allocated() / (1024 * 1024)
    return 0.0
|
||||
|
||||
def file_size_mb(path):
    """Size of the file at *path* in MiB, or 0.0 if it does not exist."""
    present = os.path.exists(path)
    return (os.path.getsize(path) / (1024 * 1024)) if present else 0.0
|
||||
|
||||
def dir_size_mb(path):
    """Total size of every file under *path* (recursive), in MiB."""
    total_bytes = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, names in os.walk(path)
        for name in names
    )
    return total_bytes / (1024 * 1024)
|
||||
|
||||
|
||||
# ── extract chunks from LongMemEval ────────────────────────────────
|
||||
|
||||
def load_chunks(max_chunks=25000, data_file=None):
    """Extract de-duplicated turn-level chunks from LongMemEval data.

    Each chunk is one user turn concatenated with the following assistant
    turn, truncated to 1000 characters.

    Args:
        max_chunks: stop as soon as this many chunks have been collected.
        data_file: path to the LongMemEval JSON dump; defaults to the
            module-level DATA_FILE so existing callers are unaffected.

    Returns:
        List of "user\\nassistant" text chunks, at most ``max_chunks`` long.
    """
    path = DATA_FILE if data_file is None else data_file
    with open(path) as f:
        data = json.load(f)

    chunks = []
    seen = set()
    for item in data:
        for sid, sess in zip(item["haystack_session_ids"], item["haystack_sessions"]):
            # Step by 2: sessions alternate user/assistant turns.
            for i in range(0, len(sess) - 1, 2):
                key = (sid, i)
                if key in seen:
                    # The same haystack session can appear under several
                    # questions — count each turn pair only once.
                    continue
                seen.add(key)
                user = sess[i]["content"]
                # i + 1 < len(sess) is guaranteed by the range bound above,
                # so the assistant turn always exists.
                asst = sess[i + 1]["content"]
                text = f"{user}\n{asst}"[:1000]
                chunks.append(text)
                if len(chunks) >= max_chunks:
                    return chunks
    return chunks
|
||||
|
||||
|
||||
# ── nocmem benchmark ────────────────────────────────────────────────
|
||||
|
||||
def bench_nocmem(encoder, chunks, n, query_texts):
    """Benchmark nocmem (HippocampalMemory) at scale n.

    Embeds the first *n* chunks, stores them one-by-one, then measures
    on-disk size of a saved snapshot, GPU/RAM memory deltas, and per-query
    recall latency (avg/p50/p99) over *query_texts*.

    Args:
        encoder: SentenceTransformer-style model with an ``encode`` method.
        chunks: list of text chunks; only the first ``n`` are used.
        n: number of memories to ingest.
        query_texts: query strings timed individually with top_k=5 recall.

    Returns:
        dict of metrics keyed identically to ``bench_chromadb`` so the two
        can be tabulated side by side.
    """
    # Start from a clean slate so memory deltas are attributable to this run.
    # (empty_cache is a no-op when CUDA is unavailable/uninitialized.)
    torch.cuda.empty_cache()
    gc.collect()

    subset = chunks[:n]
    gpu_before = get_gpu_mem_mb()
    ram_before = get_process_mem_mb()

    # batch embed
    t0 = time.monotonic()
    embeddings = encoder.encode(
        subset, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, batch_size=256, show_progress_bar=False,
    )
    embed_time = time.monotonic() - t0

    # store (key == value: auto-associative storage)
    hip = HippocampalMemory(embed_dim=EMBED_DIM, device=DEVICE)
    t1 = time.monotonic()
    for i in range(n):
        hip.store(embeddings[i], embeddings[i], metadata={"id": i})
    store_time = time.monotonic() - t1

    gpu_after = get_gpu_mem_mb()
    ram_after = get_process_mem_mb()

    # Save a snapshot to measure on-disk size. mkstemp instead of the
    # deprecated, race-prone tempfile.mktemp, and try/finally so the temp
    # file is removed even if hip.save raises.
    fd, tmp = tempfile.mkstemp(suffix=".pt")
    os.close(fd)
    try:
        hip.save(tmp)
        disk_mb = file_size_mb(tmp)
    finally:
        os.unlink(tmp)

    # query latency — multiple queries, measure p50/p99
    query_embs = encoder.encode(
        query_texts, convert_to_tensor=True, normalize_embeddings=True,
        device=DEVICE, show_progress_bar=False,
    )
    latencies = []
    for qe in query_embs:
        t = time.monotonic()
        hip.recall(qe, top_k=5)
        latencies.append((time.monotonic() - t) * 1000)

    latencies.sort()
    p50 = latencies[len(latencies) // 2]
    # NOTE: with few queries this index lands on the max — a coarse p99.
    p99 = latencies[int(len(latencies) * 0.99)]
    avg = sum(latencies) / len(latencies)

    # cleanup so the next scale starts from a clean baseline
    del hip, embeddings
    torch.cuda.empty_cache()

    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),  # memories/sec
        "disk_mb": disk_mb,
        "gpu_delta_mb": gpu_after - gpu_before,
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
|
||||
|
||||
|
||||
# ── chromadb benchmark ──────────────────────────────────────────────
|
||||
|
||||
def bench_chromadb(encoder, chunks, n, query_texts):
    """Benchmark ChromaDB (MemPalace's backend) at scale n.

    Stores the first *n* chunks in a fresh on-disk ChromaDB collection and
    measures embed/store time, directory disk footprint, process RAM delta,
    and per-query latency (avg/p50/p99) over *query_texts*. Returns a dict
    with the same keys as ``bench_nocmem`` so results tabulate side by side.
    """
    import chromadb  # local import: chromadb is an optional dependency

    subset = chunks[:n]
    ram_before = get_process_mem_mb()

    # Fresh persistent store in a temp dir so disk usage is attributable.
    tmpdir = tempfile.mkdtemp()
    client = chromadb.PersistentClient(path=tmpdir)
    collection = client.create_collection(
        name="bench",
        metadata={"hnsw:space": "cosine"},  # cosine to match nocmem's metric
    )

    # embed (numpy output — no convert_to_tensor — since chromadb wants lists)
    t0 = time.monotonic()
    embeddings_np = encoder.encode(
        subset, normalize_embeddings=True,
        batch_size=256, show_progress_bar=False,
    )
    embed_time = time.monotonic() - t0

    # store — chromadb takes numpy/list; batched to bound per-call payload
    t1 = time.monotonic()
    batch = 5000
    for start in range(0, n, batch):
        end = min(start + batch, n)
        collection.add(
            ids=[str(i) for i in range(start, end)],
            embeddings=embeddings_np[start:end].tolist(),
            documents=subset[start:end],
        )
    store_time = time.monotonic() - t1

    ram_after = get_process_mem_mb()
    disk_mb = dir_size_mb(tmpdir)

    # query latency — one timed call per query text
    query_np = encoder.encode(
        query_texts, normalize_embeddings=True, show_progress_bar=False,
    )
    latencies = []
    for qe in query_np:
        t = time.monotonic()
        collection.query(query_embeddings=[qe.tolist()], n_results=5)
        latencies.append((time.monotonic() - t) * 1000)

    latencies.sort()
    p50 = latencies[len(latencies) // 2]
    # NOTE: with few queries this index lands on the max — a coarse p99.
    p99 = latencies[int(len(latencies) * 0.99)]
    avg = sum(latencies) / len(latencies)

    # cleanup: drop handles before removing the backing directory
    del client, collection
    shutil.rmtree(tmpdir)

    return {
        "n": n,
        "embed_time_s": embed_time,
        "store_time_s": store_time,
        "ingest_rate": n / (embed_time + store_time),  # memories/sec
        "disk_mb": disk_mb,
        "gpu_delta_mb": 0,  # no VRAM attributed to the ChromaDB store itself
        "ram_delta_mb": ram_after - ram_before,
        "latency_avg_ms": avg,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
|
||||
|
||||
|
||||
# ── main ────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run the efficiency benchmark across scales and print a report.

    Loads the LongMemEval corpus, embeds chunks with a SentenceTransformer,
    then at each scale measures nocmem — and ChromaDB when it is installed —
    reporting ingest rate, disk footprint, memory delta, and query latency
    (avg/p50/p99), followed by a head-to-head summary.
    """
    print("nocmem efficiency benchmark")
    print(f"device: {DEVICE}")
    print()

    # Optional baseline: only compare against ChromaDB when it is importable.
    has_chromadb = False
    try:
        import chromadb  # noqa: F401 — availability probe only
        has_chromadb = True
        print("chromadb: available (will compare)")
    except ImportError:
        print("chromadb: not installed (nocmem only)")
    print()

    print("loading data...")
    chunks = load_chunks(25000)
    print(f" {len(chunks)} unique chunks extracted")

    print("loading encoder...")
    encoder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

    # query texts — mix of English and Chinese
    query_texts = [
        "What degree did I graduate with?",
        "How to deploy the application?",
        "What was the database error we fixed last week?",
        "Tell me about the meeting schedule",
        "What programming language should I learn?",
        "数据库密码在哪里",
        "部署到生产环境的步骤",
        "上次讨论的性能优化方案",
        "项目的技术栈是什么",
        "最近的待办事项有哪些",
        "How do I configure the server?",
        "What's the API endpoint for user authentication?",
        "Can you recommend some books on machine learning?",
        "What was the root cause of the production incident?",
        "How much memory does the GPU have?",
        "VR设备的兼容性问题",
        "模型推理的延迟是多少",
        "代码仓库的结构是怎样的",
        "如何解决内存泄漏",
        "上次会议的结论是什么",
    ]

    scales = [100, 500, 1000, 5000, 10000, 20000]
    # filter to what we have
    scales = [s for s in scales if s <= len(chunks)]

    nocmem_results = []
    chroma_results = []

    for n in scales:
        print(f"\n── scale: {n:,} memories ──")

        print(" nocmem...", end="", flush=True)
        r = bench_nocmem(encoder, chunks, n, query_texts)
        nocmem_results.append(r)
        print(f" done (R: {r['latency_avg_ms']:.1f}ms, disk: {r['disk_mb']:.1f}MB)")

        if has_chromadb:
            print(" chromadb...", end="", flush=True)
            r2 = bench_chromadb(encoder, chunks, n, query_texts)
            chroma_results.append(r2)
            print(f" done (R: {r2['latency_avg_ms']:.1f}ms, disk: {r2['disk_mb']:.1f}MB)")

    # ── report ──────────────────────────────────────────────────────

    print(f"\n{'='*80}")
    print("EFFICIENCY BENCHMARK RESULTS")
    print(f"{'='*80}")

    # table header (two layouts: with and without the ChromaDB columns)
    if has_chromadb:
        print(f"\n{'Scale':>8} | {'--- nocmem ---':^40} | {'--- ChromaDB ---':^40}")
        print(f"{'':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Rate':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'RAM':>8} {'Rate':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 100)
        for nm, cr in zip(nocmem_results, chroma_results):
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f} | "
                f"{cr['latency_avg_ms']:>8.1f} {cr['latency_p99_ms']:>8.1f} {cr['disk_mb']:>8.1f} {cr['ram_delta_mb']:>8.1f} {cr['ingest_rate']:>8.0f}"
            )
    else:
        print(f"\n{'Scale':>8} | {'Latency':>8} {'p99':>8} {'Disk':>8} {'VRAM':>8} {'Ingest':>8}")
        print(f"{'':>8} | {'(ms)':>8} {'(ms)':>8} {'(MB)':>8} {'(MB)':>8} {'(/s)':>8}")
        print("-" * 60)
        for nm in nocmem_results:
            print(
                f"{nm['n']:>8,} | "
                f"{nm['latency_avg_ms']:>8.1f} {nm['latency_p99_ms']:>8.1f} {nm['disk_mb']:>8.1f} {nm['gpu_delta_mb']:>8.1f} {nm['ingest_rate']:>8.0f}"
            )

    # summary at the largest scale reached
    if nocmem_results:
        biggest = nocmem_results[-1]
        print(f"\nnocmem @ {biggest['n']:,}:")
        print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
        print(f" Disk: {biggest['disk_mb']:.1f} MB")
        print(f" VRAM delta: {biggest['gpu_delta_mb']:.1f} MB")
        print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")

    if chroma_results:
        biggest = chroma_results[-1]
        print(f"\nChromaDB @ {biggest['n']:,}:")
        print(f" Query latency: avg {biggest['latency_avg_ms']:.1f}ms, p99 {biggest['latency_p99_ms']:.1f}ms")
        print(f" Disk: {biggest['disk_mb']:.1f} MB")
        print(f" RAM delta: {biggest['ram_delta_mb']:.1f} MB")
        print(f" Ingest rate: {biggest['ingest_rate']:.0f} memories/sec")

    # head-to-head ratios at the largest common scale
    if has_chromadb and nocmem_results and chroma_results:
        nm = nocmem_results[-1]
        cr = chroma_results[-1]
        print(f"\n── nocmem vs ChromaDB @ {nm['n']:,} ──")
        lat_ratio = cr['latency_avg_ms'] / nm['latency_avg_ms'] if nm['latency_avg_ms'] > 0 else float('inf')
        disk_ratio = cr['disk_mb'] / nm['disk_mb'] if nm['disk_mb'] > 0 else float('inf')
        rate_ratio = nm['ingest_rate'] / cr['ingest_rate'] if cr['ingest_rate'] > 0 else float('inf')
        print(f" Latency: nocmem {lat_ratio:.1f}x faster" if lat_ratio > 1 else f" Latency: ChromaDB {1/lat_ratio:.1f}x faster")
        print(f" Disk: nocmem {disk_ratio:.1f}x smaller" if disk_ratio > 1 else f" Disk: ChromaDB {1/disk_ratio:.1f}x smaller")
        print(f" Ingest: nocmem {rate_ratio:.1f}x faster" if rate_ratio > 1 else f" Ingest: ChromaDB {1/rate_ratio:.1f}x faster")
|
||||
|
||||
|
||||
# Script entry point: run the full benchmark suite when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user