# Changelog:
# - Switch extraction prompt to qa-style (80% recall vs 60% baseline)
# - Semicolon-separated cues in extraction become paraphrase variants
# - Add import_claude.py to bulk-import Claude Code conversation history
# - Fix LLM model name in systemd service, add logging basicConfig
"""Import Claude Code conversation history into nocmem.
|
|
|
|
Scans ~/.claude/projects/ for JSONL conversation files,
|
|
extracts user-assistant turn pairs, and ingests them via /ingest API.
|
|
|
|
Usage:
|
|
uv run python import_claude.py [--dry-run] [--limit N]
|
|
"""

import argparse
import json
import os
import sys
import time
from pathlib import Path

import requests

BASE = os.environ.get("NOCMEM_ENDPOINT", "http://127.0.0.1:9820")
CLAUDE_DIR = Path.home() / ".claude" / "projects"

|
def extract_turns(
    jsonl_path: Path, max_len: int = 500, min_len: int = 10
) -> list[tuple[str, str]]:
    """Extract (user_msg, assistant_msg) pairs from a JSONL conversation.

    Args:
        jsonl_path: path to a Claude Code conversation ``.jsonl`` file.
        max_len: each message is truncated to this many characters.
        min_len: messages shorter than this are skipped entirely.

    Returns:
        List of (user_text, assistant_text) pairs in conversation order.
    """
    messages: list[tuple[str, str]] = []  # (role, text)

    # JSONL is UTF-8; be explicit so the platform default encoding can't break us.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # live session logs can contain truncated/partial lines; skip them
                continue

            msg_type = obj.get("type")
            if msg_type not in ("user", "assistant"):
                continue

            msg = obj.get("message", {})
            content = msg.get("content", "")

            # content is either a plain string or a list of typed parts;
            # anything else (e.g. missing/None) is skipped
            if isinstance(content, str):
                text = content.strip()
            elif isinstance(content, list):
                parts = [
                    part["text"]
                    for part in content
                    if isinstance(part, dict) and part.get("type") == "text"
                ]
                text = "\n".join(parts).strip()
            else:
                continue

            # drop empty / trivially short messages
            if not text or len(text) < min_len:
                continue

            # skip tool-heavy assistant responses (mostly noise)
            if msg_type == "assistant" and text.count("```") > 10:
                continue

            # msg_type is already restricted to "user"/"assistant"
            messages.append((msg_type, text))

    # pair up user-assistant turns
    turns: list[tuple[str, str]] = []
    i = 0
    while i < len(messages) - 1:
        if messages[i][0] == "user":
            # find the next assistant message after this user message
            j = i + 1
            while j < len(messages) and messages[j][0] != "assistant":
                j += 1
            if j < len(messages):
                # truncate long messages to keep ingestion payloads small
                turns.append((messages[i][1][:max_len], messages[j][1][:max_len]))
            i = j + 1
        else:
            i += 1

    return turns
|
|
|
|
|
|
def ingest_turn(user_msg: str, assistant_msg: str) -> int:
    """Send a turn to nocmem /ingest, return number of memories stored.

    Best-effort: any failure (network error, non-200 response) is reported
    on stderr and counts as zero memories stored, so a long bulk import
    never aborts mid-stream on a single bad turn.
    """
    try:
        r = requests.post(
            f"{BASE}/ingest",
            json={"user_msg": user_msg, "assistant_msg": assistant_msg},
            timeout=120,  # server-side extraction can be slow per turn
        )
        if r.status_code == 200:
            return r.json().get("stored", 0)
        # previously non-200 responses were dropped silently; surface them
        print(f" error: HTTP {r.status_code} from /ingest", file=sys.stderr)
    except Exception as e:
        print(f" error: {e}", file=sys.stderr)
    return 0
|
|
|
|
|
|
def main():
    """CLI entry point: discover conversations, extract turns, ingest them.

    Flags:
        --dry-run   preview the first turns without touching the server
        --limit N   cap the number of turns ingested (0 = no cap)
        --project S only import projects whose directory name contains S
    """
    parser = argparse.ArgumentParser(description="Import Claude Code history into nocmem")
    parser.add_argument("--dry-run", action="store_true", help="just show what would be imported")
    parser.add_argument("--limit", type=int, default=0, help="max turns to ingest (0=all)")
    parser.add_argument("--project", type=str, default="", help="filter by project dir name substring")
    args = parser.parse_args()

    # guard: iterdir() on a missing directory raises FileNotFoundError
    if not CLAUDE_DIR.is_dir():
        print(f"ERROR: {CLAUDE_DIR} does not exist — nothing to import", file=sys.stderr)
        sys.exit(1)

    # find all conversation files
    conversations = []
    for project_dir in sorted(CLAUDE_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        if args.project and args.project not in project_dir.name:
            continue
        for jsonl in sorted(project_dir.glob("*.jsonl")):
            # subagent transcripts duplicate the parent conversation's content
            if "subagents" in str(jsonl):
                continue
            conversations.append((project_dir.name, jsonl))

    print(f"found {len(conversations)} conversations in {CLAUDE_DIR}")
    if args.project:
        print(f"  filtered by: {args.project}")

    # extract all turns
    all_turns = []
    for project_name, jsonl_path in conversations:
        turns = extract_turns(jsonl_path)
        if turns:
            all_turns.extend([(project_name, u, a) for u, a in turns])

    print(f"extracted {len(all_turns)} turns total\n")

    if args.limit:
        all_turns = all_turns[:args.limit]

    if args.dry_run:
        # preview only the first 20 turns to keep output readable
        for project, user_msg, asst_msg in all_turns[:20]:
            print(f"  [{project[:30]}]")
            print(f"    U: {user_msg[:80]}")
            print(f"    A: {asst_msg[:80]}")
            print()
        if len(all_turns) > 20:
            print(f"  ... and {len(all_turns) - 20} more")
        return

    # check server is up before starting a potentially long run
    try:
        r = requests.get(f"{BASE}/stats", timeout=3)
        r.raise_for_status()
        before = r.json()["num_memories"]
        print(f"nocmem: {before} memories before import\n")
    except Exception:
        print(f"ERROR: nocmem not reachable at {BASE}")
        sys.exit(1)

    # ingest
    total_stored = 0
    t0 = time.monotonic()
    for i, (project, user_msg, asst_msg) in enumerate(all_turns):
        stored = ingest_turn(user_msg, asst_msg)
        total_stored += stored
        if (i + 1) % 10 == 0:
            elapsed = time.monotonic() - t0
            rate = (i + 1) / elapsed
            eta = (len(all_turns) - i - 1) / rate if rate > 0 else 0
            print(f"  [{i+1}/{len(all_turns)}] stored={total_stored} ({rate:.1f} turns/s, ETA {eta:.0f}s)")

    elapsed = time.monotonic() - t0

    # final stats — best-effort: the import already succeeded, so a server
    # hiccup here must not crash before the summary prints
    try:
        r = requests.get(f"{BASE}/stats", timeout=3)
        after = r.json()["num_memories"]
    except Exception:
        after = "?"

    print(f"\n{'='*50}")
    print(f"imported {total_stored} memories from {len(all_turns)} turns")
    print(f"nocmem: {before} → {after} memories")
    print(f"time: {elapsed:.1f}s")


if __name__ == "__main__":
    main()
|