noc/tools/gen_voice

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = ["requests"]
# ///
"""Generate voice audio using IndexTTS2 with a fixed reference voice.

Usage:
    ./gen_voice --schema
    ./gen_voice '{"text":"你好世界"}'
    ./gen_voice 你好世界
"""

import json
import os
import sys
import time
import requests

# Base URL of the IndexTTS2 Gradio server that all requests are sent to.
INDEXTTS_URL = "http://100.107.41.75:7860"
# Directory containing this script; the reference audio is located relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Reference recording uploaded to the server and used as the voice prompt.
REF_AUDIO = os.path.join(SCRIPT_DIR, "..", "assets", "ref_voice.mp3")
# Directory the generated wav files are written to.
OUTPUT_DIR = os.path.expanduser("~/down")

# Local JSON file remembering the server-side path returned by the last upload
# of REF_AUDIO, so the file does not have to be re-uploaded on every run.
_CACHE_FILE = "/tmp/noc_gen_voice_ref_cache.json"

# Tool manifest printed verbatim (as JSON) by `--schema`: the tool name, a
# human-readable description, and a JSON-Schema object for its arguments.
SCHEMA = {
    "name": "gen_voice",
    "description": (
        "Generate speech audio from text using voice cloning (IndexTTS2). "
        "Returns the file path of the generated wav. "
        "Use send_file to send it to the user."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "text": {"type": "string", "description": "The text to synthesize into speech"},
        },
        "required": ["text"],
    },
}


def get_ref_path():
    """Return the server-side path of the reference audio, uploading it if needed.

    The path returned by a previous upload is remembered in ``_CACHE_FILE``.
    A remembered path is reused only when a HEAD request for it on the
    IndexTTS2 server answers 200; a missing, unreadable or malformed cache, or
    a failed probe, falls through to a fresh upload of ``REF_AUDIO``.

    Returns:
        The first element of the upload endpoint's JSON response, which callers
        pass back to the server as the ``path`` of a file payload.

    Raises:
        OSError: If ``REF_AUDIO`` cannot be opened.
        requests.RequestException: If the upload fails, times out, or the
            server answers with an error status.
    """
    # Try the cache directly instead of checking for existence first, so a
    # file that disappears between the check and the open is handled too.
    cached = None
    try:
        with open(_CACHE_FILE, encoding="utf-8") as f:
            cached = json.load(f)["path"]
    except (OSError, ValueError, KeyError, TypeError):
        # No cache, unreadable cache, invalid JSON, or unexpected JSON shape.
        cached = None

    if isinstance(cached, str) and cached:
        try:
            # Quick probe: the cached path is only valid while the server
            # still serves that file.
            r = requests.head(f"{INDEXTTS_URL}/gradio_api/file={cached}", timeout=3)
            if r.status_code == 200:
                return cached
        except requests.RequestException:
            # Best effort only; the upload below reports real connectivity
            # problems to the caller.
            pass

    # Upload the reference audio. The timeout keeps an unreachable server from
    # blocking the tool forever.
    with open(REF_AUDIO, "rb") as f:
        resp = requests.post(
            f"{INDEXTTS_URL}/gradio_api/upload",
            files={"files": f},
            timeout=60,
        )
    resp.raise_for_status()
    ref_path = resp.json()[0]

    # Caching is only an optimisation: a failure to write the cache must not
    # discard an upload that already succeeded.
    try:
        with open(_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"path": ref_path}, f)
    except OSError:
        pass

    return ref_path


def _save_wav(url):
    """Download *url* and write its body to ``OUTPUT_DIR/tts_<timestamp>.wav``; return that path."""
    wav = requests.get(url, timeout=60)
    wav.raise_for_status()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    ts = time.strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(OUTPUT_DIR, f"tts_{ts}.wav")
    with open(out_path, "wb") as f:
        f.write(wav.content)
    return out_path


def synthesize(text):
    """Synthesize *text* with the reference voice and save the audio locally.

    Submits a job to the ``synthesize`` endpoint of the IndexTTS2 Gradio
    server, reads the job's server-sent event stream until an event delivers a
    file URL, then downloads that file into ``OUTPUT_DIR``.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the written file, ``OUTPUT_DIR/tts_<YYYYmmdd_HHMMSS>.wav``.

    Raises:
        RuntimeError: If the server reports an error or the stream ends
            without delivering a result.
        requests.RequestException: On connection failures, timeouts, or HTTP
            error statuses.
    """
    ref = get_ref_path()
    file_data = {"path": ref, "meta": {"_type": "gradio.FileData"}}

    # Submit the job; the response only carries the id of the queued event.
    resp = requests.post(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize",
        json={
            "data": [
                text,
                file_data,  # spk_audio
                file_data,  # emo_audio
                0.5,  # emo_alpha
                0, 0, 0, 0, 0, 0, 0, 0.8,  # emotions (calm=0.8)
                False,  # use_emo_text
                "",  # emo_text
                False,  # use_random
            ]
        },
        timeout=30,
    )
    resp.raise_for_status()
    event_id = resp.json()["event_id"]

    # Follow the result stream. Only the connect phase is bounded: synthesis
    # time is unknown, so reads may legitimately block for a long time.
    # The context manager releases the streamed connection on every exit path.
    with requests.get(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize/{event_id}",
        stream=True,
        timeout=(10, None),
    ) as result_resp:
        result_resp.raise_for_status()
        # Event streams are UTF-8; without this, requests may guess another
        # charset, or yield bytes when it cannot guess one at all.
        result_resp.encoding = "utf-8"

        event = None  # name from the most recent "event:" line, if any
        for line in result_resp.iter_lines(decode_unicode=True):
            if not line:
                # A blank line terminates the current event.
                event = None
                continue
            if line.startswith("event:"):
                event = line[len("event:"):].strip()
                continue
            if not line.startswith("data: "):
                continue

            data = json.loads(line[6:])

            # NOTE(review): Gradio's call API is understood to send keep-alive
            # "heartbeat" events with a null payload; they must not be
            # mistaken for a failed job. Confirm against the deployed version.
            if event == "heartbeat":
                continue
            if data is None:
                raise RuntimeError("TTS synthesis failed (server returned null)")
            if event == "error":
                raise RuntimeError(f"TTS synthesis failed: {data}")

            if isinstance(data, list) and data and isinstance(data[0], dict):
                url = data[0].get("url", "")
                if url:
                    return _save_wav(url)

    raise RuntimeError("No result received from TTS server")


def main():
    """Command-line entry point.

    Handles, based on ``sys.argv``:

    * no argument, ``--help`` or ``-h``: print the module usage text, exit 0;
    * ``--schema``: print ``SCHEMA`` as JSON, exit 0;
    * a first argument starting with ``{``: parse it as a JSON object and take
      its ``text`` value;
    * anything else: join all arguments with single spaces to form the text.

    On success the path returned by ``synthesize`` is printed. Every failure
    prints a message to stdout and exits with status 1.
    """
    if len(sys.argv) < 2 or sys.argv[1] in ("--help", "-h"):
        # __doc__ is None when docstrings are unavailable (e.g. python -OO).
        print((__doc__ or "").strip())
        sys.exit(0)

    if sys.argv[1] == "--schema":
        print(json.dumps(SCHEMA, ensure_ascii=False))
        sys.exit(0)

    arg = sys.argv[1]
    if arg.startswith("{"):
        try:
            text = json.loads(arg).get("text", "")
        except json.JSONDecodeError as e:
            print(f"Invalid JSON: {e}")
            sys.exit(1)
    else:
        text = " ".join(sys.argv[1:])

    if text is None or text == "":
        print("Error: text is required")
        sys.exit(1)

    # JSON input can carry any type; only a string can be synthesized, so
    # reject the rest here instead of forwarding it to the server.
    if not isinstance(text, str):
        print("Error: text must be a string")
        sys.exit(1)

    try:
        path = synthesize(text)
        print(path)
    except Exception as e:
        # Top-level boundary: report any failure as a one-line message.
        print(f"Error: {e}")
        sys.exit(1)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()