- Switch extraction prompt to qa-style (80% recall vs 60% baseline)
- Semicolon-separated cues in extraction become paraphrase variants
- Add import_claude.py to bulk-import Claude Code conversation history
- Fix LLM model name in systemd service, add logging basicConfig

218 lines · 7.9 KiB · Python
"""Test different extraction prompts for ingest quality.

Takes a few real conversation turns, tries different prompts,
compares what gets extracted, then tests recall against target queries.
"""
|
||
|
||
import json
|
||
import time
|
||
import requests
|
||
import torch
|
||
from sentence_transformers import SentenceTransformer
|
||
from nuonuo.hippocampus import HippocampalMemory
|
||
from openai import OpenAI
|
||
|
||
# Device used for all sentence embeddings.
DEVICE = "cuda"
# OpenAI-compatible client pointed at a self-hosted endpoint; the API key is a
# placeholder because the server does not check it.
LLM = OpenAI(base_url="http://100.84.7.49:8000/v1", api_key="unused")
# NOTE(review): confirm this model id matches what the server actually serves.
MODEL = "gemma4-31b"
# Sentence encoder; produces 384-dim vectors (matches embed_dim=384 used below).
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
|
||
|
||
# real conversation turns from Claude Code history
# Each item is a (user_message, assistant_reply) pair; both strings are fed
# verbatim into the extraction prompt templates below.
TURNS = [
    ("装好了tailscale。你帮忙生成吧,用新的bot名叫yoyo",
     "好,Tailscale IP `100.116.213.25`,yoyo 用户创建成功。继续搞 token 和部署。"),
    ("能来一个org共享的gitea ci cd runner 不,就跑在这台机器上就行。",
     "搞起来!装 act_runner,注册到 euphon org,以后每个仓库直接加 `.gitea/workflows/` 就能跑 CI 了。"),
    ("能不能给heradev一个bridge的nic,secondary的,这样就有192.168.1.0/24的地址可以用了",
     "饭团,你是要给 heradev 这个 KVM 虚拟机加一个桥接网卡对吧?这样它就能直接拿到物理网段 192.168.1.0/24 的 IP。"),
    ("i7 bot也不好,直接叫@ms吧",
     "好,@ms 改好了。现在来研究全局 webhook 的问题——之前系统级 webhook 没触发,让我深入看看。"),
    ("AI/ML 工具克隆 全部删掉吧,emblemcv,visint也删掉",
     "好嘞,这一波删掉能省 ~62G。动手了。"),
]
|
||
|
||
# queries we expect to recall
# Each item is (query, keywords): a prompt variant scores a hit for the query
# when any keyword appears (case-insensitive substring) in the recalled target.
TARGET_QUERIES = [
    ("yoyo是什么", ["yoyo"]),
    ("act runner怎么装的", ["act_runner", "runner"]),
    ("heradev网络怎么配的", ["heradev", "桥接", "bridge", "192.168"]),
    ("gitea bot叫什么", ["ms", "@ms", "麻薯"]),
    ("清理了哪些项目", ["emblemcv", "visint", "62G", "删"]),
]
|
||
|
||
# different extraction prompts to test
# All variants must emit the same line format ("CUE: ... | TARGET: ... |
# IMPORTANCE: ...") so extract_with_prompt can parse them with one regex.
PROMPTS = {
    # English baseline: generic "extract key facts" instructions.
    "baseline": """From this conversation turn, extract key facts worth remembering for future conversations.
For each fact, provide a "cue" (what would trigger recalling this) and a "target" (the fact itself).
Rate importance 0-1 (1 = critical fact, 0 = trivial).

User: {user}
Assistant: {assistant}

Output format (one per line):
CUE: <trigger phrase> | TARGET: <fact> | IMPORTANCE: <0-1>

Only extract genuinely useful facts. If nothing worth remembering, output NONE.""",

    # Chinese variant focused on entities: names/aliases, configs, actions, decisions.
    "entity_focused": """从这段对话中提取值得记住的事实。重点关注:
- 名称、代号、别名(谁叫什么)
- 配置、参数、端口、地址
- 做了什么操作、改了什么
- 决策和原因

每条事实用以下格式输出(每行一条):
CUE: <用什么问题能想起这件事> | TARGET: <事实本身,要具体> | IMPORTANCE: <0-1>

User: {user}
Assistant: {assistant}

如果没有值得记住的,输出 NONE。""",

    # Chinese variant asking for 2-3 semicolon-separated cue paraphrases per fact;
    # the semicolons are split downstream into Hopfield cue_variants.
    "multi_cue": """从这段对话中提取值得长期记住的事实。

要求:
1. 每条事实提供 2-3 个不同的触发短语(cue),用分号分隔
2. target 要具体、独立可理解(不依赖上下文)
3. 包含所有出现的名称、代号、配置值

格式(每行一条):
CUE: <触发短语1>; <触发短语2>; <触发短语3> | TARGET: <具体事实> | IMPORTANCE: <0-1>

User: {user}
Assistant: {assistant}

没有值得记住的则输出 NONE。""",

    # Chinese variant that frames each memory as a natural question/answer pair.
    "qa_style": """你是一个记忆提取器。把这段对话变成若干个"问答对"——未来有人问这个问题时,能直接给出答案。

要求:
- 问题要自然,像人真的会这么问
- 答案要具体完整,包含关键细节(名称、数字、地址等)
- 同一个事实可以从不同角度提问

格式(每行一条):
CUE: <自然的提问方式> | TARGET: <完整的回答> | IMPORTANCE: <0-1>

User: {user}
Assistant: {assistant}

没有值得记住的则输出 NONE。""",
}
|
||
|
||
import re
|
||
|
||
def extract_with_prompt(prompt_template, user_msg, asst_msg):
    """Run one extraction prompt on a conversation turn and parse the output.

    Parameters:
        prompt_template: a PROMPTS entry with {user}/{assistant} placeholders.
        user_msg / asst_msg: one conversation turn.

    Returns a list of dicts {"cue": str, "target": str, "importance": float},
    one per well-formed "CUE: ... | TARGET: ... | IMPORTANCE: ..." line.
    Lines that do not match the format are skipped; a literal NONE line stops
    parsing. On any LLM error the turn is skipped (empty list), but the error
    is reported instead of being silently swallowed as before.
    """
    prompt = prompt_template.format(user=user_msg, assistant=asst_msg)
    try:
        resp = LLM.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3, max_tokens=512,
        )
        result = resp.choices[0].message.content
    except Exception as e:
        # best-effort: one failed turn must not abort the whole experiment
        print(f"  [warn] extraction failed: {e}")
        return []

    # compiled once per call instead of once per output line
    pattern = re.compile(
        r"CUE:\s*(.+?)\s*\|\s*TARGET:\s*(.+?)\s*\|\s*IMPORTANCE:\s*([\d.]+)"
    )
    memories = []
    for line in result.strip().split("\n"):
        if line.strip() == "NONE":
            break
        m = pattern.match(line)
        if m:
            memories.append({
                "cue": m.group(1).strip(),
                "target": m.group(2).strip(),
                "importance": float(m.group(3)),
            })
    return memories
|
||
|
||
|
||
def emb(text):
    """Encode a single string into a normalized embedding tensor on DEVICE."""
    vectors = encoder.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=DEVICE,
    )
    return vectors[0]
|
||
|
||
|
||
def test_recall(memories_list, queries):
    """Build a memory from extracted memories and test recall.

    Semicolon-separated cues are treated as paraphrase variants: the first
    part becomes the primary cue, the rest are stored as `cue_variants`.
    A query counts as a hit when any of its keywords appears
    (case-insensitive substring) in the concatenated top-3 recalled targets.

    Returns (hits, total) over `queries`.
    """
    hip = HippocampalMemory(embed_dim=384, beta=32.0, hopfield_top_k=10, device=DEVICE)

    for mem in memories_list:
        cue_text = mem["cue"]
        target_text = mem["target"]

        # handle multi-cue (semicolon separated); decide the primary cue first
        # so it is embedded exactly once — the old code embedded the full
        # cue_text and then discarded that embedding when variants existed
        parts = (
            [p.strip() for p in cue_text.split(";") if p.strip()]
            if ";" in cue_text else []
        )
        variants = []
        if len(parts) > 1:
            cue_emb = emb(parts[0])
            variants = [emb(p) for p in parts[1:]]
        else:
            cue_emb = emb(cue_text)

        hip.store(cue_emb, emb(target_text), cue_variants=variants or None,
                  metadata={"cue": cue_text, "target": target_text})

    hits = 0
    for query, keywords in queries:
        qe = emb(query)
        results = hip.recall(qe, top_k=3)
        recalled_text = " ".join(r.metadata["target"] for r in results)
        if any(kw.lower() in recalled_text.lower() for kw in keywords):
            hits += 1

    return hits, len(queries)
|
||
|
||
|
||
def main():
    """Run every prompt variant over TURNS and report extraction + recall.

    For each variant: extract memories from all turns, print them, compute
    aggregate recall via test_recall, then rebuild the memory and print the
    top-1 recalled target per query for qualitative inspection.
    """
    print("extraction prompt experiment\n")
    print(f"turns: {len(TURNS)}, queries: {len(TARGET_QUERIES)}\n")

    for name, template in PROMPTS.items():
        print(f"{'='*60}")
        print(f" prompt: {name}")
        print(f"{'='*60}")

        # extract with this prompt variant across all turns
        all_memories = []
        for user_msg, asst_msg in TURNS:
            mems = extract_with_prompt(template, user_msg, asst_msg)
            all_memories.extend(mems)
            for m in mems:
                print(f" [{m['importance']:.1f}] CUE: {m['cue'][:50]}")
                print(f" TGT: {m['target'][:60]}")

        print(f"\n extracted: {len(all_memories)} memories")

        hits, total = test_recall(all_memories, TARGET_QUERIES)
        print(f" recall: {hits}/{total} ({hits/total*100:.0f}%)")

        # show per-query results (top-1) on a freshly rebuilt memory
        hip = HippocampalMemory(embed_dim=384, beta=32.0, hopfield_top_k=10, device=DEVICE)
        for mem in all_memories:
            cue_text = mem["cue"]
            target_emb = emb(mem["target"])
            # multi-cue handling mirrors test_recall: first non-empty part is
            # the primary cue, the rest are variants.  Filtering empty parts
            # fixes the old `split(";")[0]` path, which embedded "" for cues
            # that start with a semicolon.
            variants = []
            primary_cue = cue_text
            if ";" in cue_text:
                parts = [p.strip() for p in cue_text.split(";") if p.strip()]
                if parts:
                    primary_cue = parts[0]
                    variants = [emb(p) for p in parts[1:]]
            hip.store(emb(primary_cue), target_emb, cue_variants=variants or None,
                      metadata={"cue": cue_text, "target": mem["target"]})

        for query, keywords in TARGET_QUERIES:
            qe = emb(query)
            results = hip.recall(qe, top_k=1)
            if results:
                target = results[0].metadata["target"][:60]
                hit = any(kw.lower() in results[0].metadata["target"].lower() for kw in keywords)
                mark = "✓" if hit else "✗"
                print(f" {mark} {query:20s} → {target}")
            else:
                print(f" ✗ {query:20s} → (empty)")

        print()
|
||
|
||
|
||
if __name__ == "__main__":
    # script entry point; allows importing this module without running it
    main()
|