nocmem: qa-style extraction prompt, multi-cue variants, claude history importer
- Switch extraction prompt to qa-style (80% recall vs 60% baseline)
- Semicolon-separated cues in extraction become paraphrase variants
- Add import_claude.py to bulk-import Claude Code conversation history
- Fix LLM model name in systemd service, add logging basicConfig
This commit is contained in:
178
mem/import_claude.py
Normal file
178
mem/import_claude.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""Import Claude Code conversation history into nocmem.
|
||||
|
||||
Scans ~/.claude/projects/ for JSONL conversation files,
|
||||
extracts user-assistant turn pairs, and ingests them via /ingest API.
|
||||
|
||||
Usage:
|
||||
uv run python import_claude.py [--dry-run] [--limit N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Base URL of the nocmem HTTP service; the NOCMEM_ENDPOINT environment
# variable overrides the local default.
BASE = os.getenv("NOCMEM_ENDPOINT", "http://127.0.0.1:9820")

# Directory whose sub-directories are scanned for *.jsonl conversation files.
CLAUDE_DIR = Path.home().joinpath(".claude", "projects")
|
||||
|
||||
|
||||
def _content_text(content) -> str:
    """Return the stripped text of a message ``content`` value.

    ``content`` may be a plain string or a list of parts; only dict parts
    with ``type == "text"`` and a string ``text`` contribute. Any other
    shape yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()
    if isinstance(content, list):
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        ]
        return "\n".join(texts).strip()
    return ""


def extract_turns(jsonl_path: Path) -> list[tuple[str, str]]:
    """Extract (user_msg, assistant_msg) pairs from a JSONL conversation.

    Each line is parsed as one JSON record. Only records whose ``type`` is
    ``"user"`` or ``"assistant"`` are considered, and their text is read from
    ``message.content``. Lines that are not valid JSON, records that are not
    JSON objects, messages with fewer than 10 characters of text, and
    assistant messages containing more than 10 code-fence markers are skipped.

    Pairing: the first user message after the previous pair is matched with
    the next assistant message; user messages in between are dropped, and a
    trailing user message with no assistant reply produces no pair.

    Args:
        jsonl_path: Path of the conversation file to read.

    Returns:
        ``(user_text, assistant_text)`` tuples in file order, each text
        truncated to 500 characters.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    messages = []  # (role, text)

    # Decode explicitly as UTF-8 instead of the platform default; undecodable
    # bytes are replaced so one bad byte does not abort the whole file.
    with open(jsonl_path, encoding="utf-8", errors="replace") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            # A line can be valid JSON without being an object (e.g. `42`).
            if not isinstance(obj, dict):
                continue

            msg_type = obj.get("type")
            if msg_type not in ("user", "assistant"):
                continue

            msg = obj.get("message")
            if not isinstance(msg, dict):
                continue

            text = _content_text(msg.get("content", ""))
            if len(text) < 10:
                continue

            # skip tool-heavy assistant responses (mostly noise)
            if msg_type == "assistant" and text.count("```") > 10:
                continue

            messages.append((msg_type, text))

    # pair up user-assistant turns
    turns = []
    pending_user = None
    for role, text in messages:
        if role == "user":
            # Keep the first unanswered user message; later ones are ignored
            # until an assistant reply closes the pair.
            if pending_user is None:
                pending_user = text
        elif pending_user is not None:
            # truncate long messages
            turns.append((pending_user[:500], text[:500]))
            pending_user = None

    return turns
|
||||
|
||||
|
||||
def ingest_turn(user_msg: str, assistant_msg: str) -> int:
    """Send a turn to nocmem /ingest, return number of memories stored.

    This is best-effort: a transport error, a non-200 response, or a body
    that is not a JSON object is reported on stderr and counted as 0 stored,
    so one bad turn does not abort a bulk import.

    Args:
        user_msg: Text of the user side of the turn.
        assistant_msg: Text of the assistant side of the turn.

    Returns:
        The integer ``stored`` field of the JSON response, or 0 on any failure.
    """
    try:
        r = requests.post(
            f"{BASE}/ingest",
            json={"user_msg": user_msg, "assistant_msg": assistant_msg},
            timeout=120,
        )
        if r.status_code != 200:
            # Previously dropped silently; surface it so failures are visible.
            print(f"  error: /ingest returned HTTP {r.status_code}", file=sys.stderr)
            return 0
        payload = r.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers transport failures; ValueError covers a
        # response body that is not valid JSON.
        print(f"  error: {e}", file=sys.stderr)
        return 0

    stored = payload.get("stored", 0) if isinstance(payload, dict) else 0
    # Guard the caller's running total against a non-integer field.
    return stored if isinstance(stored, int) else 0
|
||||
|
||||
|
||||
def _fetch_memory_count(timeout: float) -> int:
    """Return the ``num_memories`` field reported by ``GET {BASE}/stats``.

    Raises:
        requests.RequestException: On connection failure, timeout or HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError, TypeError: If the payload has no ``num_memories`` entry.
    """
    r = requests.get(f"{BASE}/stats", timeout=timeout)
    r.raise_for_status()
    return r.json()["num_memories"]


def main():
    """Command-line entry point: import Claude Code history into nocmem.

    Steps: collect ``*.jsonl`` files from each project directory under
    ``CLAUDE_DIR``, extract user/assistant turns from them, then either
    print a preview (``--dry-run``) or send each turn to the server with
    ``ingest_turn`` and print a summary.

    Options:
        --dry-run    Print up to 20 extracted turns and return without
                     contacting the server.
        --limit N    Ingest at most N turns (0 = all).
        --project S  Only scan project directories whose name contains S.

    Exits with status 1 when ``CLAUDE_DIR`` does not exist or the server's
    /stats endpoint cannot be queried before the import starts.
    """
    parser = argparse.ArgumentParser(description="Import Claude Code history into nocmem")
    parser.add_argument("--dry-run", action="store_true", help="just show what would be imported")
    parser.add_argument("--limit", type=int, default=0, help="max turns to ingest (0=all)")
    parser.add_argument("--project", type=str, default="", help="filter by project dir name substring")
    args = parser.parse_args()

    # A negative limit would silently slice turns off the end of the list.
    if args.limit < 0:
        parser.error("--limit must be >= 0")

    # Fail with a clear message instead of a traceback from iterdir().
    if not CLAUDE_DIR.is_dir():
        print(f"ERROR: Claude history directory not found: {CLAUDE_DIR}", file=sys.stderr)
        sys.exit(1)

    # find all conversation files
    conversations = []
    for project_dir in sorted(CLAUDE_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        if args.project and args.project not in project_dir.name:
            continue
        for jsonl in sorted(project_dir.glob("*.jsonl")):
            if "subagents" in str(jsonl):
                continue
            conversations.append((project_dir.name, jsonl))

    print(f"found {len(conversations)} conversations in {CLAUDE_DIR}")
    if args.project:
        print(f"  filtered by: {args.project}")

    # extract all turns
    all_turns = []
    for project_name, jsonl_path in conversations:
        try:
            turns = extract_turns(jsonl_path)
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable file should not abort the whole import.
            print(f"  skipping {jsonl_path}: {e}", file=sys.stderr)
            continue
        all_turns.extend((project_name, u, a) for u, a in turns)

    print(f"extracted {len(all_turns)} turns total\n")

    if args.limit:
        all_turns = all_turns[:args.limit]

    if args.dry_run:
        for project, user_msg, asst_msg in all_turns[:20]:
            print(f"  [{project[:30]}]")
            print(f"    U: {user_msg[:80]}")
            print(f"    A: {asst_msg[:80]}")
            print()
        if len(all_turns) > 20:
            print(f"  ... and {len(all_turns) - 20} more")
        return

    # check server
    try:
        before = _fetch_memory_count(timeout=3)
    except (requests.RequestException, ValueError, KeyError, TypeError):
        print(f"ERROR: nocmem not reachable at {BASE}")
        sys.exit(1)
    print(f"nocmem: {before} memories before import\n")

    # ingest
    total = len(all_turns)
    total_stored = 0
    t0 = time.monotonic()
    for done, (_project, user_msg, asst_msg) in enumerate(all_turns, start=1):
        total_stored += ingest_turn(user_msg, asst_msg)
        if done % 10 == 0:
            elapsed = time.monotonic() - t0
            # elapsed can be 0 on a coarse clock; avoid dividing by zero.
            rate = done / elapsed if elapsed > 0 else 0
            eta = (total - done) / rate if rate > 0 else 0
            print(f"  [{done}/{total}] stored={total_stored} ({rate:.1f} turns/s, ETA {eta:.0f}s)")

    elapsed = time.monotonic() - t0

    # final stats: bounded by a timeout and non-fatal, so a stats failure
    # after a long import cannot hang the script or lose the summary.
    try:
        after = _fetch_memory_count(timeout=30)
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        print(f"  warning: could not read final stats: {e}", file=sys.stderr)
        after = "?"

    print(f"\n{'='*50}")
    print(f"imported {total_stored} memories from {total} turns")
    print(f"nocmem: {before} → {after} memories")
    print(f"time: {elapsed:.1f}s")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user