"""Test different extraction prompts for ingest quality.

Takes a few real conversation turns, tries different prompts, compares what
gets extracted, then tests recall against target queries.
"""

import json
import re
import sys
import time

import requests
import torch
from openai import OpenAI
from sentence_transformers import SentenceTransformer

from nuonuo.hippocampus import HippocampalMemory

DEVICE = "cuda"
LLM = OpenAI(base_url="http://100.84.7.49:8000/v1", api_key="unused")
MODEL = "gemma4-31b"
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

# Embedding width passed to HippocampalMemory; it has to match the size of the
# vectors produced by `encoder`.
EMBED_DIM = 384

# real conversation turns from Claude Code history, as (user, assistant) pairs
TURNS = [
    (
        "装好了tailscale。你帮忙生成吧,用新的bot名叫yoyo",
        "好,Tailscale IP `100.116.213.25`,yoyo 用户创建成功。继续搞 token 和部署。",
    ),
    (
        "能来一个org共享的gitea ci cd runner 不,就跑在这台机器上就行。",
        "搞起来!装 act_runner,注册到 euphon org,以后每个仓库直接加 `.gitea/workflows/` 就能跑 CI 了。",
    ),
    (
        "能不能给heradev一个bridge的nic,secondary的,这样就有192.168.1.0/24的地址可以用了",
        "饭团,你是要给 heradev 这个 KVM 虚拟机加一个桥接网卡对吧?这样它就能直接拿到物理网段 192.168.1.0/24 的 IP。",
    ),
    (
        "i7 bot也不好,直接叫@ms吧",
        "好,@ms 改好了。现在来研究全局 webhook 的问题——之前系统级 webhook 没触发,让我深入看看。",
    ),
    (
        "AI/ML 工具克隆 全部删掉吧,emblemcv,visint也删掉",
        "好嘞,这一波删掉能省 ~62G。动手了。",
    ),
]

# queries we expect to recall, each paired with keywords of which at least one
# must appear in the recalled target text for the query to count as a hit
TARGET_QUERIES = [
    ("yoyo是什么", ["yoyo"]),
    ("act runner怎么装的", ["act_runner", "runner"]),
    ("heradev网络怎么配的", ["heradev", "桥接", "bridge", "192.168"]),
    ("gitea bot叫什么", ["ms", "@ms", "麻薯"]),
    ("清理了哪些项目", ["emblemcv", "visint", "62G", "删"]),
]

# different extraction prompts to test; every template is filled in with
# str.format(user=..., assistant=...)
# NOTE(review): the "baseline" format line has empty CUE/TARGET slots while the
# other prompts carry <...> placeholders there -- confirm that is intended.
PROMPTS = {
    "baseline": """From this conversation turn, extract key facts worth remembering for future conversations.
For each fact, provide a "cue" (what would trigger recalling this) and a "target" (the fact itself).
Rate importance 0-1 (1 = critical fact, 0 = trivial).

User: {user}
Assistant: {assistant}

Output format (one per line):
CUE: | TARGET: | IMPORTANCE: <0-1>

Only extract genuinely useful facts. If nothing worth remembering, output NONE.""",
    "entity_focused": """从这段对话中提取值得记住的事实。重点关注:
- 名称、代号、别名(谁叫什么)
- 配置、参数、端口、地址
- 做了什么操作、改了什么
- 决策和原因

每条事实用以下格式输出(每行一条):
CUE: <用什么问题能想起这件事> | TARGET: <事实本身,要具体> | IMPORTANCE: <0-1>

User: {user}
Assistant: {assistant}

如果没有值得记住的,输出 NONE。""",
    "multi_cue": """从这段对话中提取值得长期记住的事实。

要求:
1. 每条事实提供 2-3 个不同的触发短语(cue),用分号分隔
2. target 要具体、独立可理解(不依赖上下文)
3. 包含所有出现的名称、代号、配置值

格式(每行一条):
CUE: <触发短语1>; <触发短语2>; <触发短语3> | TARGET: <具体事实> | IMPORTANCE: <0-1>

User: {user}
Assistant: {assistant}

没有值得记住的则输出 NONE。""",
    "qa_style": """你是一个记忆提取器。把这段对话变成若干个"问答对"——未来有人问这个问题时,能直接给出答案。

要求:
- 问题要自然,像人真的会这么问
- 答案要具体完整,包含关键细节(名称、数字、地址等)
- 同一个事实可以从不同角度提问

格式(每行一条):
CUE: <自然的提问方式> | TARGET: <完整的回答> | IMPORTANCE: <0-1>

User: {user}
Assistant: {assistant}

没有值得记住的则输出 NONE。""",
}

# One extracted memory per line: "CUE: ... | TARGET: ... | IMPORTANCE: <number>".
_MEMORY_LINE = re.compile(
    r"CUE:\s*(.+?)\s*\|\s*TARGET:\s*(.+?)\s*\|\s*IMPORTANCE:\s*([\d.]+)"
)


def _parse_memories(text):
    """Turn raw LLM output into a list of memory dicts.

    Each returned dict has the keys "cue", "target" and "importance" (float).
    Parsing stops at the first line that is exactly NONE; lines that do not
    follow the expected format, or whose importance is not a number, are
    skipped.
    """
    memories = []
    for line in text.strip().splitlines():
        if line.strip() == "NONE":
            break
        # search() rather than match(): tolerate indentation or a list marker
        # in front of "CUE:".
        found = _MEMORY_LINE.search(line)
        if found is None:
            continue
        cue, target, raw_importance = found.groups()
        try:
            # The character class also swallows a sentence-ending period
            # ("0.8."), so drop trailing dots before converting.
            importance = float(raw_importance.rstrip("."))
        except ValueError:
            # e.g. ".." or "1.2.3": not a usable score, ignore the line.
            continue
        memories.append({
            "cue": cue.strip(),
            "target": target.strip(),
            "importance": importance,
        })
    return memories


def extract_with_prompt(prompt_template, user_msg, asst_msg):
    """Run one extraction prompt over one conversation turn.

    Args:
        prompt_template: template containing {user} and {assistant} fields.
        user_msg: the user side of the turn.
        asst_msg: the assistant side of the turn.

    Returns:
        A list of dicts with keys "cue", "target" and "importance". The list
        is empty when the request fails or nothing could be parsed.
    """
    prompt = prompt_template.format(user=user_msg, assistant=asst_msg)
    try:
        resp = LLM.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=512,
        )
        result = resp.choices[0].message.content
    except Exception as exc:
        # Best-effort: one failed request must not abort the whole comparison,
        # but it should be visible instead of looking like "nothing extracted".
        print(f"[warn] extraction request failed: {exc!r}", file=sys.stderr)
        return []
    # The message content may be None; treat that as empty output.
    return _parse_memories(result or "")


def emb(text):
    """Return the normalized embedding tensor of a single string."""
    return encoder.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
    )[0]


def _split_cues(cue_text):
    """Split a semicolon-separated cue into (primary cue, list of variants).

    Empty segments are dropped. A cue without usable segments is returned
    unchanged with no variants.
    """
    parts = [part.strip() for part in cue_text.split(";") if part.strip()]
    if not parts:
        return cue_text, []
    return parts[0], parts[1:]


def _build_memory(memories_list):
    """Create a HippocampalMemory and store every extracted memory in it.

    For multi-cue memories the first cue is stored as the main key and the
    remaining ones are passed as cue variants.
    """
    hip = HippocampalMemory(
        embed_dim=EMBED_DIM, beta=32.0, hopfield_top_k=10, device=DEVICE
    )
    for mem in memories_list:
        cue_text = mem["cue"]
        target_text = mem["target"]
        primary, extra_cues = _split_cues(cue_text)
        variants = [emb(cue) for cue in extra_cues]
        hip.store(
            emb(primary),
            emb(target_text),
            cue_variants=variants or None,
            metadata={"cue": cue_text, "target": target_text},
        )
    return hip


def test_recall(memories_list, queries):
    """Build a memory from extracted memories and test recall.

    Args:
        memories_list: dicts as returned by extract_with_prompt.
        queries: (query text, keywords) pairs.

    Returns:
        (hits, total): a query counts as a hit when any of its keywords occurs
        (case-insensitively) in the targets of the top-3 recalled memories.
    """
    hip = _build_memory(memories_list)

    hits = 0
    for query, keywords in queries:
        results = hip.recall(emb(query), top_k=3)
        recalled_text = " ".join(r.metadata["target"] for r in results).lower()
        if any(kw.lower() in recalled_text for kw in keywords):
            hits += 1
    return hits, len(queries)


def main():
    """Run every prompt over all turns and print extraction and recall results."""
    print("extraction prompt experiment\n")
    print(f"turns: {len(TURNS)}, queries: {len(TARGET_QUERIES)}\n")

    for name, template in PROMPTS.items():
        print(f"{'='*60}")
        print(f" prompt: {name}")
        print(f"{'='*60}")

        all_memories = []
        for user_msg, asst_msg in TURNS:
            mems = extract_with_prompt(template, user_msg, asst_msg)
            all_memories.extend(mems)
            for m in mems:
                print(f" [{m['importance']:.1f}] CUE: {m['cue'][:50]}")
                print(f" TGT: {m['target'][:60]}")

        print(f"\n extracted: {len(all_memories)} memories")

        hits, total = test_recall(all_memories, TARGET_QUERIES)
        # Guard the percentage against an empty query list.
        pct = hits / total * 100 if total else 0.0
        print(f" recall: {hits}/{total} ({pct:.0f}%)")

        # show per-query results; a separate memory instance is built here, as
        # before, so this pass does not depend on the recall pass above
        hip = _build_memory(all_memories)
        for query, keywords in TARGET_QUERIES:
            results = hip.recall(emb(query), top_k=1)
            if results:
                top_target = results[0].metadata["target"]
                hit = any(kw.lower() in top_target.lower() for kw in keywords)
                mark = "✓" if hit else "✗"
                print(f" {mark} {query:20s} → {top_target[:60]}")
            else:
                print(f" ✗ {query:20s} → (empty)")
        print()


if __name__ == "__main__":
    main()