# Changelog:
# - Switch extraction prompt to qa-style (80% recall vs 60% baseline)
# - Semicolon-separated cues in extraction become paraphrase variants
# - Add import_claude.py to bulk-import Claude Code conversation history
# - Fix LLM model name in systemd service, add logging basicConfig
"""Import Claude Code conversation history into nocmem.
|
|
|
|
Scans ~/.claude/projects/ for JSONL conversation files,
|
|
extracts user-assistant turn pairs, and ingests them via /ingest API.
|
|
|
|
Usage:
|
|
uv run python import_claude.py [--dry-run] [--limit N]
|
|
"""

import argparse
import json
import os
import sys
import time
from pathlib import Path

import requests

BASE = os.environ.get("NOCMEM_ENDPOINT", "http://127.0.0.1:9820")
CLAUDE_DIR = Path.home() / ".claude" / "projects"

|
def extract_turns(
    jsonl_path: Path, max_len: int = 500, min_len: int = 10
) -> list[tuple[str, str]]:
    """Extract (user_msg, assistant_msg) pairs from a JSONL conversation.

    Args:
        jsonl_path: path to a Claude Code conversation ``.jsonl`` file.
        max_len: each message is truncated to this many characters.
        min_len: messages shorter than this are skipped entirely.

    Returns:
        List of (user_text, assistant_text) pairs in conversation order.
    """
    messages: list[tuple[str, str]] = []  # (role, text)

    # JSONL is UTF-8; be explicit so the platform default encoding can't break us.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # live session logs can contain truncated/partial lines; skip them
                continue

            msg_type = obj.get("type")
            if msg_type not in ("user", "assistant"):
                continue

            msg = obj.get("message", {})
            content = msg.get("content", "")

            # content is either a plain string or a list of typed parts;
            # anything else (e.g. missing/None) is skipped
            if isinstance(content, str):
                text = content.strip()
            elif isinstance(content, list):
                parts = [
                    part["text"]
                    for part in content
                    if isinstance(part, dict) and part.get("type") == "text"
                ]
                text = "\n".join(parts).strip()
            else:
                continue

            # drop empty / trivially short messages
            if not text or len(text) < min_len:
                continue

            # skip tool-heavy assistant responses (mostly noise)
            if msg_type == "assistant" and text.count("```") > 10:
                continue

            # msg_type is already restricted to "user"/"assistant"
            messages.append((msg_type, text))

    # pair up user-assistant turns
    turns: list[tuple[str, str]] = []
    i = 0
    while i < len(messages) - 1:
        if messages[i][0] == "user":
            # find the next assistant message after this user message
            j = i + 1
            while j < len(messages) and messages[j][0] != "assistant":
                j += 1
            if j < len(messages):
                # truncate long messages to keep ingestion payloads small
                turns.append((messages[i][1][:max_len], messages[j][1][:max_len]))
            i = j + 1
        else:
            i += 1

    return turns
|
|
|
|
|
|
def ingest_turn(user_msg: str, assistant_msg: str) -> int:
    """Send a turn to nocmem /ingest, return number of memories stored.

    Best-effort: any failure (network error, non-200 response) is reported
    on stderr and counts as zero memories stored, so a long bulk import
    never aborts mid-stream on a single bad turn.
    """
    try:
        r = requests.post(
            f"{BASE}/ingest",
            json={"user_msg": user_msg, "assistant_msg": assistant_msg},
            timeout=120,  # server-side extraction can be slow per turn
        )
        if r.status_code == 200:
            return r.json().get("stored", 0)
        # previously non-200 responses were dropped silently; surface them
        print(f" error: HTTP {r.status_code} from /ingest", file=sys.stderr)
    except Exception as e:
        print(f" error: {e}", file=sys.stderr)
    return 0
|
|
|
|
|
|
def main():
    """CLI entry point: discover conversations, extract turns, ingest them.

    Flags:
        --dry-run   preview the first turns without touching the server
        --limit N   cap the number of turns ingested (0 = no cap)
        --project S only import projects whose directory name contains S
    """
    parser = argparse.ArgumentParser(description="Import Claude Code history into nocmem")
    parser.add_argument("--dry-run", action="store_true", help="just show what would be imported")
    parser.add_argument("--limit", type=int, default=0, help="max turns to ingest (0=all)")
    parser.add_argument("--project", type=str, default="", help="filter by project dir name substring")
    args = parser.parse_args()

    # guard: iterdir() on a missing directory raises FileNotFoundError
    if not CLAUDE_DIR.is_dir():
        print(f"ERROR: {CLAUDE_DIR} does not exist — nothing to import", file=sys.stderr)
        sys.exit(1)

    # find all conversation files
    conversations = []
    for project_dir in sorted(CLAUDE_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        if args.project and args.project not in project_dir.name:
            continue
        for jsonl in sorted(project_dir.glob("*.jsonl")):
            # subagent transcripts duplicate the parent conversation's content
            if "subagents" in str(jsonl):
                continue
            conversations.append((project_dir.name, jsonl))

    print(f"found {len(conversations)} conversations in {CLAUDE_DIR}")
    if args.project:
        print(f"  filtered by: {args.project}")

    # extract all turns
    all_turns = []
    for project_name, jsonl_path in conversations:
        turns = extract_turns(jsonl_path)
        if turns:
            all_turns.extend([(project_name, u, a) for u, a in turns])

    print(f"extracted {len(all_turns)} turns total\n")

    if args.limit:
        all_turns = all_turns[:args.limit]

    if args.dry_run:
        # preview only the first 20 turns to keep output readable
        for project, user_msg, asst_msg in all_turns[:20]:
            print(f"  [{project[:30]}]")
            print(f"    U: {user_msg[:80]}")
            print(f"    A: {asst_msg[:80]}")
            print()
        if len(all_turns) > 20:
            print(f"  ... and {len(all_turns) - 20} more")
        return

    # check server is up before starting a potentially long run
    try:
        r = requests.get(f"{BASE}/stats", timeout=3)
        r.raise_for_status()
        before = r.json()["num_memories"]
        print(f"nocmem: {before} memories before import\n")
    except Exception:
        print(f"ERROR: nocmem not reachable at {BASE}")
        sys.exit(1)

    # ingest
    total_stored = 0
    t0 = time.monotonic()
    for i, (project, user_msg, asst_msg) in enumerate(all_turns):
        stored = ingest_turn(user_msg, asst_msg)
        total_stored += stored
        if (i + 1) % 10 == 0:
            elapsed = time.monotonic() - t0
            rate = (i + 1) / elapsed
            eta = (len(all_turns) - i - 1) / rate if rate > 0 else 0
            print(f"  [{i+1}/{len(all_turns)}] stored={total_stored} ({rate:.1f} turns/s, ETA {eta:.0f}s)")

    elapsed = time.monotonic() - t0

    # final stats — best-effort: the import already succeeded, so a server
    # hiccup here must not crash before the summary prints
    try:
        r = requests.get(f"{BASE}/stats", timeout=3)
        after = r.json()["num_memories"]
    except Exception:
        after = "?"

    print(f"\n{'='*50}")
    print(f"imported {total_stored} memories from {len(all_turns)} turns")
    print(f"nocmem: {before} → {after} memories")
    print(f"time: {elapsed:.1f}s")


if __name__ == "__main__":
    main()
|