add nocmem: auto memory recall + ingest via NuoNuo hippocampal network

- nocmem Python service (mem/): FastAPI wrapper around NuoNuo's
  Hopfield-Hebbian memory, exposing /recall, /ingest, /store, /stats endpoints
  (an example client call is sketched after this list)
- NOC integration: auto recall after user message (injected as system msg),
  async ingest after LLM response (fire-and-forget)
- Recall: cosine pre-filter (threshold 0.35) + Hopfield attention (β=32),
  top_k=3, KV-cache friendly (appended after user msg, not in system prompt);
  the scoring and the message injection are sketched after this list
- Ingest: LLM extraction + paraphrase augmentation, heuristic fallback
- Wired into main.rs, life.rs (agent done), http.rs (api chat)
- Config: optional `nocmem.endpoint` in config.yaml
- Includes benchmarks: LongMemEval (R@5=94.0%), efficiency, noise vs scale
- Design doc: doc/nocmem.md
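
A minimal sketch of the two-stage recall scoring described above (cosine pre-filter at 0.35, then a sharp Hopfield softmax with β=32, top 3 kept), assuming normalized embeddings in a plain tensor store. The real path goes through NuoNuo's HippocampalMemory and the FastAPI service; function and variable names here are illustrative only.

```python
import torch

COSINE_THRESHOLD = 0.35  # cosine pre-filter threshold from the commit message
BETA = 32.0              # Hopfield attention inverse temperature
TOP_K = 3                # memories injected after the user message

def recall_scores(query_emb: torch.Tensor, cue_embs: torch.Tensor):
    """Two-stage recall sketch: cosine pre-filter, then Hopfield-style softmax.

    query_emb: (D,) normalized query embedding
    cue_embs:  (N, D) normalized cue embeddings of stored memories
    Returns (indices, weights) for up to TOP_K memories; empty tensors when
    nothing clears the pre-filter ("clearly remembered or nothing").
    """
    sims = cue_embs @ query_emb                         # cosine similarity on unit vectors
    keep = torch.nonzero(sims >= COSINE_THRESHOLD).squeeze(-1)
    if keep.numel() == 0:
        return keep, sims[keep]                         # recall nothing
    weights = torch.softmax(BETA * sims[keep], dim=0)   # sharp attention over survivors
    top = torch.topk(weights, k=min(TOP_K, keep.numel()))
    return keep[top.indices], top.values
```

A large β concentrates the weight on one or two memories, which is the "clearly remembered or nothing" behaviour the parameter sweep in the test file below measures.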
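And a sketch of the KV-cache-friendly injection: recalled memories are appended as a system message after the user message rather than edited into the system prompt, so the cached prefix stays byte-identical. The message shapes are illustrative; the real wiring lives in main.rs, life.rs and http.rs.

```python
def build_messages(system_prompt, history, user_msg, recalled):
    """Keep the system prompt and history unchanged so the KV-cache prefix is reused."""
    messages = [
        {"role": "system", "content": system_prompt},
        *history,
        {"role": "user", "content": user_msg},
    ]
    if recalled:  # only add a message when something was actually recalled
        messages.append({
            "role": "system",
            "content": "Relevant memories:\n" + "\n".join(recalled),
        })
    return messages
```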
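Finally, a hedged example of calling the service over HTTP. Only the endpoint paths come from the commit message; the request/response field names, the port value for `nocmem.endpoint`, and the blocking calls are assumptions (in NOC the ingest call is asynchronous, fire-and-forget). See doc/nocmem.md for the real schema.

```python
import requests

NOCMEM = "http://127.0.0.1:8900"  # hypothetical value for `nocmem.endpoint` in config.yaml

def recall(user_msg: str, top_k: int = 3) -> dict:
    """Fetch memories relevant to the latest user message."""
    # field names are illustrative; see doc/nocmem.md for the actual schema
    resp = requests.post(f"{NOCMEM}/recall", json={"text": user_msg, "top_k": top_k}, timeout=5)
    resp.raise_for_status()
    return resp.json()

def ingest(user_msg: str, assistant_msg: str) -> None:
    """Submit the latest exchange for extraction and storage (NOC does this asynchronously)."""
    requests.post(f"{NOCMEM}/ingest",
                  json={"user": user_msg, "assistant": assistant_msg}, timeout=5)
```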
Author: Fam Zheng
Date: 2026-04-11 12:24:48 +01:00
parent 688387dac3
commit 7000ccda0f
17 changed files with 4164 additions and 3 deletions


@@ -0,0 +1,104 @@
"""Test Hopfield attention sharpness with different top_k and beta.
Goal: find settings that give "either clearly remembered or nothing"
instead of flat attention across 20 candidates.
"""
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
DEVICE = "cuda"
EMBED_DIM = 384
print("loading encoder...")
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
def emb(text):
    return encoder.encode([text], convert_to_tensor=True, normalize_embeddings=True, device=DEVICE)[0]
# (cue, target) memory pairs, stored identically in every config; the test data is in Chinese
MEMORIES = [
    ("bot的名字叫什么", "bot的名字叫小乖是Fam给取的"),
    ("有哪些工具可以用", "工具有: fam_todo, send_file, spawn_agent, run_shell, run_python, update_memory"),
    ("vLLM在5090上的性能", "RTX 5090上vLLM跑gemma只有4.8 tok/s需要切换到awq_marlin"),
    ("repo-vis项目是什么", "repo-vis用Rust后端+Three.js前端的3D代码库可视化目标支持Linux内核和Pico VR"),
    ("repo-vis的性能瓶颈", "Linux内核79K文件SQLite 1GB上限和O(n)反序列化是瓶颈需要n-ary tree按需合并"),
    ("明天的待办事项", "最紧迫的是emblem scanner的AI Chat和KB部分"),
    ("后端切换到了什么", "NOC后端切换到了vLLM速度变快了"),
    ("数据库密码在哪里", "数据库密码存在 /etc/secrets/db.env 文件中"),
    ("什么GPU", "服务器有NVIDIA RTX 4090 24GB VRAM"),
    ("home有多少log文件", "home目录及子目录下共有960个.log文件"),
]
QUERIES = [
    ("repo-vis怎么样了", "repo-vis", True),  # should recall clearly
    ("数据库密码", "密码", True),  # should recall clearly
    ("今天天气怎么样", "天气", False),  # irrelevant, should recall nothing
    ("vllm速度", "vllm", True),  # should recall clearly
    ("你喜欢吃什么", "吃什么", False),  # irrelevant
    ("VR支持", "VR", True),  # edge case
]
CONFIGS = [
    # (top_k, beta, label)
    (20, 16.0, "baseline (top_k=20, β=16)"),
    (10, 16.0, "top_k=10, β=16"),
    (5, 16.0, "top_k=5, β=16"),
    (20, 32.0, "top_k=20, β=32"),
    (20, 64.0, "top_k=20, β=64"),
    (10, 32.0, "top_k=10, β=32"),
    (5, 32.0, "top_k=5, β=32"),
    (5, 64.0, "top_k=5, β=64"),
]
# pre-embed everything
mem_embs = [(emb(c), emb(t), c, t) for c, t in MEMORIES]
query_embs = [(emb(q), label, relevant) for q, label, relevant in QUERIES]
print(f"\n{len(MEMORIES)} memories, {len(QUERIES)} queries, {len(CONFIGS)} configs\n")
for top_k, beta, label in CONFIGS:
    print(f"{'='*70}")
    print(f" {label}")
    print(f"{'='*70}")
    hip = HippocampalMemory(
        embed_dim=EMBED_DIM, hopfield_top_k=top_k, beta=beta, device=DEVICE,
    )
    for ce, te, cue_text, target_text in mem_embs:
        hip.store(ce, te, metadata={"cue": cue_text, "target": target_text})
    for qe, qlabel, should_recall in query_embs:
        results = hip.recall(qe, top_k=5)
        # show distribution
        sims = [r.similarity for r in results]
        top1 = sims[0] if sims else 0
        top2 = sims[1] if len(sims) > 1 else 0
        gap = top1 - top2  # gap between #1 and #2
        above_5pct = sum(1 for s in sims if s >= 0.05)
        above_10pct = sum(1 for s in sims if s >= 0.10)
        top_target = results[0].metadata["target"][:40] if results else ""
        tag = "✓" if should_recall else "✗"  # expected to recall vs irrelevant query
        print(f" [{tag}] {qlabel:10s} top1={top1:.0%} top2={top2:.0%} gap={gap:.0%} "
              f"≥5%:{above_5pct} ≥10%:{above_10pct} → {top_target}")
    # summary: average sharpness
    total_gap = 0
    total_top1 = 0
    for qe, qlabel, _ in query_embs:
        results = hip.recall(qe, top_k=5)
        sims = [r.similarity for r in results]
        total_top1 += sims[0] if sims else 0
        total_gap += (sims[0] - sims[1]) if len(sims) > 1 else 0
    n = len(query_embs)
    print(f"\n avg top1={total_top1/n:.0%} avg gap={total_gap/n:.0%}")
    print()
    del hip
    torch.cuda.empty_cache()