Files
noc/mem/import_claude.py
Fam Zheng 35cafbd4ca nocmem: qa-style extraction prompt, multi-cue variants, claude history importer
- Switch extraction prompt to qa-style (80% recall vs 60% baseline)
- Semicolon-separated cues in extraction become paraphrase variants
- Add import_claude.py to bulk-import Claude Code conversation history
- Fix LLM model name in systemd service, add logging basicConfig
2026-04-11 22:57:17 +01:00

179 lines
5.6 KiB
Python

"""Import Claude Code conversation history into nocmem.
Scans ~/.claude/projects/ for JSONL conversation files,
extracts user-assistant turn pairs, and ingests them via /ingest API.
Usage:
uv run python import_claude.py [--dry-run] [--limit N]
"""
import argparse
import json
import os
import sys
import time
from pathlib import Path
import requests
# nocmem ingest API base URL; override via the NOCMEM_ENDPOINT env var.
BASE = os.environ.get("NOCMEM_ENDPOINT", "http://127.0.0.1:9820")
# Root directory where Claude Code keeps per-project conversation JSONL files.
CLAUDE_DIR = Path.home() / ".claude" / "projects"
def extract_turns(jsonl_path: Path) -> list[tuple[str, str]]:
    """Extract (user_msg, assistant_msg) pairs from a JSONL conversation.

    Each line of the file is one JSON object. Only "user" and "assistant"
    records with non-trivial text content are kept; each user message is
    then paired with the next assistant message that follows it. Both
    sides of a pair are truncated to 500 characters.

    Malformed JSON lines, short messages (< 10 chars), and tool-heavy
    assistant responses are silently skipped.
    """
    messages: list[tuple[str, str]] = []  # (role, text)
    # Transcripts are UTF-8; errors="replace" keeps a stray bad byte from
    # aborting the whole import.
    with open(jsonl_path, encoding="utf-8", errors="replace") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue  # truncated/corrupt lines happen; skip them
            msg_type = obj.get("type")
            if msg_type not in ("user", "assistant"):
                continue
            content = obj.get("message", {}).get("content", "")
            # content is either a plain string or a list of content blocks
            if isinstance(content, str):
                text = content.strip()
            elif isinstance(content, list):
                # keep only "text" blocks (tool_use/tool_result are noise);
                # .get guards against a text block missing its "text" key
                parts = [
                    part.get("text", "")
                    for part in content
                    if isinstance(part, dict) and part.get("type") == "text"
                ]
                text = "\n".join(parts).strip()
            else:
                continue
            if len(text) < 10:
                continue  # empty or trivially short message
            # skip tool-heavy assistant responses (mostly noise)
            if msg_type == "assistant" and text.count("```") > 10:
                continue
            messages.append((msg_type, text))
    # pair up user-assistant turns
    turns: list[tuple[str, str]] = []
    i = 0
    while i < len(messages) - 1:
        role, user_text = messages[i]
        if role != "user":
            i += 1
            continue
        # find the next assistant message after this user message
        j = i + 1
        while j < len(messages) and messages[j][0] != "assistant":
            j += 1
        if j >= len(messages):
            break  # no assistant reply remains
        turns.append((user_text[:500], messages[j][1][:500]))  # truncate long messages
        i = j + 1
    return turns
def ingest_turn(user_msg: str, assistant_msg: str) -> int:
    """POST one (user, assistant) turn to the nocmem /ingest endpoint.

    Returns the number of memories the server reports as stored, or 0 on
    any failure (network error, non-200 response, bad response body).
    """
    stored = 0
    try:
        resp = requests.post(
            f"{BASE}/ingest",
            json={"user_msg": user_msg, "assistant_msg": assistant_msg},
            timeout=120,
        )
        if resp.status_code == 200:
            stored = resp.json().get("stored", 0)
    except Exception as e:
        # best-effort import: report and move on to the next turn
        print(f" error: {e}", file=sys.stderr)
    return stored
def main():
    """Scan CLAUDE_DIR for conversations, extract turns, and ingest them."""
    parser = argparse.ArgumentParser(description="Import Claude Code history into nocmem")
    parser.add_argument("--dry-run", action="store_true", help="just show what would be imported")
    parser.add_argument("--limit", type=int, default=0, help="max turns to ingest (0=all)")
    parser.add_argument("--project", type=str, default="", help="filter by project dir name substring")
    args = parser.parse_args()
    # fail early with a clear message instead of a traceback from iterdir()
    if not CLAUDE_DIR.is_dir():
        print(f"ERROR: {CLAUDE_DIR} does not exist", file=sys.stderr)
        sys.exit(1)
    # find all conversation files
    conversations = []
    for project_dir in sorted(CLAUDE_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        if args.project and args.project not in project_dir.name:
            continue
        for jsonl in sorted(project_dir.glob("*.jsonl")):
            # subagent transcripts are internal tool chatter, not dialogue
            if "subagents" in str(jsonl):
                continue
            conversations.append((project_dir.name, jsonl))
    print(f"found {len(conversations)} conversations in {CLAUDE_DIR}")
    if args.project:
        print(f" filtered by: {args.project}")
    # extract all turns
    all_turns = []
    for project_name, jsonl_path in conversations:
        turns = extract_turns(jsonl_path)
        if turns:
            all_turns.extend([(project_name, u, a) for u, a in turns])
    print(f"extracted {len(all_turns)} turns total\n")
    if args.limit:
        all_turns = all_turns[:args.limit]
    if args.dry_run:
        # preview the first 20 turns without touching the server
        for project, user_msg, asst_msg in all_turns[:20]:
            print(f" [{project[:30]}]")
            print(f" U: {user_msg[:80]}")
            print(f" A: {asst_msg[:80]}")
            print()
        if len(all_turns) > 20:
            print(f" ... and {len(all_turns) - 20} more")
        return
    # check the server is reachable before a potentially long import
    try:
        r = requests.get(f"{BASE}/stats", timeout=3)
        r.raise_for_status()
        before = r.json()["num_memories"]
        print(f"nocmem: {before} memories before import\n")
    except Exception:
        print(f"ERROR: nocmem not reachable at {BASE}")
        sys.exit(1)
    # ingest
    total_stored = 0
    t0 = time.monotonic()
    for i, (project, user_msg, asst_msg) in enumerate(all_turns):
        stored = ingest_turn(user_msg, asst_msg)
        total_stored += stored
        if (i + 1) % 10 == 0:
            # progress line every 10 turns with throughput and ETA
            elapsed = time.monotonic() - t0
            rate = (i + 1) / elapsed
            eta = (len(all_turns) - i - 1) / rate if rate > 0 else 0
            print(f" [{i+1}/{len(all_turns)}] stored={total_stored} ({rate:.1f} turns/s, ETA {eta:.0f}s)")
    elapsed = time.monotonic() - t0
    # final stats (with a timeout so a wedged server can't hang the summary)
    r = requests.get(f"{BASE}/stats", timeout=10)
    after = r.json()["num_memories"]
    print(f"\n{'='*50}")
    print(f"imported {total_stored} memories from {len(all_turns)} turns")
    print(f"nocmem: {before} -> {after} memories")
    print(f"time: {elapsed:.1f}s")
if __name__ == "__main__":
    main()