Files
noc/mem/benchmarks/prompt_experiment.py
Fam Zheng 35cafbd4ca nocmem: qa-style extraction prompt, multi-cue variants, claude history importer
- Switch extraction prompt to qa-style (80% recall vs 60% baseline)
- Semicolon-separated cues in extraction become paraphrase variants
- Add import_claude.py to bulk-import Claude Code conversation history
- Fix LLM model name in systemd service, add logging basicConfig
2026-04-11 22:57:17 +01:00

218 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Test different extraction prompts for ingest quality.
Takes a few real conversation turns, tries different prompts,
compares what gets extracted, then tests recall against target queries.
"""
import json
import time
import requests
import torch
from sentence_transformers import SentenceTransformer
from nuonuo.hippocampus import HippocampalMemory
from openai import OpenAI
# Runtime configuration shared by everything below.
# Prefer the GPU, but fall back to CPU so the script still runs on hosts
# without CUDA instead of failing while loading the encoder.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# OpenAI-compatible chat endpoint; the API key is a placeholder value.
LLM = OpenAI(base_url="http://100.84.7.49:8000/v1", api_key="unused")
MODEL = "gemma4-31b"
# Sentence encoder used to embed cues, targets and queries.
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
# Real conversation turns taken from Claude Code history.
# Each entry is a (user message, assistant reply) pair.
TURNS = [
    # Tailscale setup / new bot named yoyo
    (
        "装好了tailscale。你帮忙生成吧用新的bot名叫yoyo",
        "Tailscale IP `100.116.213.25`yoyo 用户创建成功。继续搞 token 和部署。",
    ),
    # org-wide Gitea CI/CD runner
    (
        "能来一个org共享的gitea ci cd runner 不,就跑在这台机器上就行。",
        "搞起来!装 act_runner注册到 euphon org以后每个仓库直接加 `.gitea/workflows/` 就能跑 CI 了。",
    ),
    # bridged NIC for the heradev VM
    (
        "能不能给heradev一个bridge的nicsecondary的这样就有192.168.1.0/24的地址可以用了",
        "饭团,你是要给 heradev 这个 KVM 虚拟机加一个桥接网卡对吧?这样它就能直接拿到物理网段 192.168.1.0/24 的 IP。",
    ),
    # bot handle changed to @ms
    (
        "i7 bot也不好直接叫@ms吧",
        "好,@ms 改好了。现在来研究全局 webhook 的问题——之前系统级 webhook 没触发,让我深入看看。",
    ),
    # deleting cloned projects to free disk space
    (
        "AI/ML 工具克隆 全部删掉吧emblemcvvisint也删掉",
        "好嘞,这一波删掉能省 ~62G。动手了。",
    ),
]
# Queries we expect to recall. Each entry is (query, keywords); a query counts
# as a hit when any keyword occurs (case-insensitively) in the recalled text.
# Keywords must be non-empty: "" is a substring of every string, so an empty
# keyword would make its query a hit no matter what was recalled.
TARGET_QUERIES = [
    ("yoyo是什么", ["yoyo"]),
    ("act runner怎么装的", ["act_runner", "runner"]),
    ("heradev网络怎么配的", ["heradev", "桥接", "bridge", "192.168"]),
    ("gitea bot叫什么", ["ms", "@ms", "麻薯"]),
    ("清理了哪些项目", ["emblemcv", "visint", "62G"]),
]
# Extraction prompt variants under test, keyed by experiment name.
# Every template carries {user} and {assistant} placeholders and asks for one
# "CUE: ... | TARGET: ... | IMPORTANCE: ..." record per line, or NONE.
PROMPTS = {
    # English instructions: generic key-fact extraction.
    "baseline": """From this conversation turn, extract key facts worth remembering for future conversations.
For each fact, provide a "cue" (what would trigger recalling this) and a "target" (the fact itself).
Rate importance 0-1 (1 = critical fact, 0 = trivial).
User: {user}
Assistant: {assistant}
Output format (one per line):
CUE: <trigger phrase> | TARGET: <fact> | IMPORTANCE: <0-1>
Only extract genuinely useful facts. If nothing worth remembering, output NONE.""",
    # Chinese instructions: focus on names, configuration values, actions
    # taken, and decisions.
    "entity_focused": """从这段对话中提取值得记住的事实。重点关注:
- 名称、代号、别名(谁叫什么)
- 配置、参数、端口、地址
- 做了什么操作、改了什么
- 决策和原因
每条事实用以下格式输出(每行一条):
CUE: <用什么问题能想起这件事> | TARGET: <事实本身,要具体> | IMPORTANCE: <0-1>
User: {user}
Assistant: {assistant}
如果没有值得记住的,输出 NONE。""",
    # Chinese instructions: several semicolon-separated cue phrases per fact.
    "multi_cue": """从这段对话中提取值得长期记住的事实。
要求:
1. 每条事实提供 2-3 个不同的触发短语cue用分号分隔
2. target 要具体、独立可理解(不依赖上下文)
3. 包含所有出现的名称、代号、配置值
格式(每行一条):
CUE: <触发短语1>; <触发短语2>; <触发短语3> | TARGET: <具体事实> | IMPORTANCE: <0-1>
User: {user}
Assistant: {assistant}
没有值得记住的则输出 NONE。""",
    # Chinese instructions: rewrite the turn as natural question/answer pairs.
    "qa_style": """你是一个记忆提取器。把这段对话变成若干个"问答对"——未来有人问这个问题时,能直接给出答案。
要求:
- 问题要自然,像人真的会这么问
- 答案要具体完整,包含关键细节(名称、数字、地址等)
- 同一个事实可以从不同角度提问
格式(每行一条):
CUE: <自然的提问方式> | TARGET: <完整的回答> | IMPORTANCE: <0-1>
User: {user}
Assistant: {assistant}
没有值得记住的则输出 NONE。""",
}
import re
# One extracted record: "CUE: ... | TARGET: ... | IMPORTANCE: <number>".
# Compiled once instead of on every line of every reply.
_MEMORY_LINE_RE = re.compile(
    r"CUE:\s*(.+?)\s*\|\s*TARGET:\s*(.+?)\s*\|\s*IMPORTANCE:\s*([\d.]+)"
)


def extract_with_prompt(prompt_template, user_msg, asst_msg):
    """Ask the LLM to extract memories from one conversation turn.

    Args:
        prompt_template: Prompt text containing ``{user}`` and
            ``{assistant}`` placeholders.
        user_msg: The user side of the turn.
        asst_msg: The assistant side of the turn.

    Returns:
        A list of dicts with keys ``"cue"`` and ``"target"`` (str) and
        ``"importance"`` (float), in reply order. The list is empty when the
        request fails, the reply has no content, or no line matches the
        expected record format.
    """
    prompt = prompt_template.format(user=user_msg, assistant=asst_msg)
    try:
        resp = LLM.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=512,
        )
        result = resp.choices[0].message.content
    except Exception as e:
        # Best-effort: one failed request must not abort the experiment,
        # but report it so an empty result is not mistaken for "NONE".
        print(f" [extract failed] {type(e).__name__}: {e}")
        return []

    # The reply content may be missing or empty; nothing to parse then.
    if not result:
        return []

    memories = []
    for raw_line in result.strip().split("\n"):
        line = raw_line.strip()
        if line == "NONE":
            # The model signalled that nothing (further) is worth keeping.
            break
        m = _MEMORY_LINE_RE.match(line)
        if not m:
            continue
        try:
            importance = float(m.group(3))
        except ValueError:
            # [\d.]+ also matches strings like "0.8." or ".."; skip those
            # records rather than crash.
            continue
        memories.append({
            "cue": m.group(1).strip(),
            "target": m.group(2).strip(),
            "importance": importance,
        })
    return memories
def emb(text):
    """Encode a single string and return its normalized embedding tensor."""
    batch = encoder.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
    )
    # encode() was given a one-element list; hand back that single row.
    return batch[0]
def test_recall(memories_list, queries):
    """Build a memory from extracted memories and test recall.

    Args:
        memories_list: Dicts with at least ``"cue"`` and ``"target"`` string
            entries. A cue may hold several semicolon-separated phrases; the
            first non-empty phrase is stored as the primary cue and the rest
            as cue variants.
        queries: Iterable of ``(query, keywords)`` pairs.

    Returns:
        ``(hits, total)`` where ``hits`` is the number of queries for which
        at least one non-empty keyword occurs (case-insensitively) in the
        targets of the top-3 recalled memories, and ``total`` is
        ``len(queries)``.
    """
    hip = HippocampalMemory(embed_dim=384, beta=32.0, hopfield_top_k=10, device=DEVICE)
    for mem in memories_list:
        cue_text = mem["cue"]
        target_text = mem["target"]
        # Split multi-cue text (semicolon separated), dropping empty pieces
        # so stray or leading semicolons never yield an empty cue.
        parts = [p.strip() for p in cue_text.split(";") if p.strip()]
        primary = parts[0] if parts else cue_text
        # Embed the primary cue once; the full cue text is only kept as
        # metadata.
        variants = [emb(p) for p in parts[1:]]
        hip.store(
            emb(primary),
            emb(target_text),
            cue_variants=variants or None,
            metadata={"cue": cue_text, "target": target_text},
        )

    hits = 0
    for query, keywords in queries:
        results = hip.recall(emb(query), top_k=3)
        recalled_text = " ".join(r.metadata["target"] for r in results).lower()
        # Skip empty keywords: "" is contained in every string and would
        # turn the query into an unconditional hit.
        if any(kw and kw.lower() in recalled_text for kw in keywords):
            hits += 1
    return hits, len(queries)
def main():
    """Run every extraction prompt over TURNS and print recall results.

    For each prompt in PROMPTS: extract memories from every turn, print them,
    print the aggregate recall from test_recall, then print the best match
    for each target query.
    """
    print("extraction prompt experiment\n")
    print(f"turns: {len(TURNS)}, queries: {len(TARGET_QUERIES)}\n")
    for name, template in PROMPTS.items():
        print(f"{'='*60}")
        print(f" prompt: {name}")
        print(f"{'='*60}")
        all_memories = []
        for user_msg, asst_msg in TURNS:
            mems = extract_with_prompt(template, user_msg, asst_msg)
            all_memories.extend(mems)
            for m in mems:
                print(f" [{m['importance']:.1f}] CUE: {m['cue'][:50]}")
                print(f" TGT: {m['target'][:60]}")
        print(f"\n extracted: {len(all_memories)} memories")
        hits, total = test_recall(all_memories, TARGET_QUERIES)
        # Guard the percentage against an empty query list.
        pct = hits / total * 100 if total else 0.0
        print(f" recall: {hits}/{total} ({pct:.0f}%)")

        # Show per-query results. test_recall does not expose its store, so
        # an equivalent one is rebuilt here.
        hip = HippocampalMemory(embed_dim=384, beta=32.0, hopfield_top_k=10, device=DEVICE)
        for mem in all_memories:
            cue_text = mem["cue"]
            # Drop empty pieces so a leading ";" can neither produce an
            # empty primary cue nor lose the first real phrase.
            parts = [p.strip() for p in cue_text.split(";") if p.strip()]
            primary = parts[0] if parts else cue_text
            variants = [emb(p) for p in parts[1:]]
            hip.store(
                emb(primary),
                emb(mem["target"]),
                cue_variants=variants or None,
                metadata={"cue": cue_text, "target": mem["target"]},
            )
        for query, keywords in TARGET_QUERIES:
            # Only the single best match is displayed here, whereas the
            # recall score above is computed over the top 3.
            results = hip.recall(emb(query), top_k=1)
            if results:
                full_target = results[0].metadata["target"]
                lowered = full_target.lower()
                # Empty keywords are ignored; "" would match any target.
                hit = any(kw and kw.lower() in lowered for kw in keywords)
                mark = "✓" if hit else "✗"
                print(f" {mark} {query:20s} → {full_target[:60]}")
            else:
                print(f" ✗ {query:20s} → (empty)")
        print()
# Run the experiment only when executed as a script, not on import.
if __name__ == "__main__":
    main()