- gen_voice: IndexTTS2 voice cloning via the tools/gen_voice script; the reference audio is cached on the server to avoid re-uploading
- Message timestamps: created_at column in the messages table, prepended to content in API calls so the LLM sees message times
- Image understanding: photos are converted to base64 multimodal content for vision-capable models
- Group chat: independent session contexts per chat_id; sendMessageDraft is disabled in groups (private chat only)
- Voice transcription: whisper service integration; transcribed text is injected with a [语音消息] prefix
- Integration tests are marked #[ignore] (they require external services)
- Reference voice asset: assets/ref_voice.mp3
- .gitignore: target/, noc.service, config/state/db files
153 lines
4.4 KiB
Plaintext
Executable File
153 lines
4.4 KiB
Plaintext
Executable File
#!/usr/bin/env -S uv run --script
|
|
# /// script
|
|
# requires-python = ">=3.11"
|
|
# dependencies = ["requests"]
|
|
# ///
|
|
"""Generate voice audio using IndexTTS2 with a fixed reference voice.
|
|
|
|
Usage:
|
|
./gen_voice --schema
|
|
./gen_voice '{"text":"你好世界"}'
|
|
./gen_voice 你好世界
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import requests
|
|
|
|
# Base URL of the IndexTTS2 Gradio server. It can be overridden with the
# INDEXTTS_URL environment variable; when the variable is unset or empty the
# built-in default is used. A trailing slash is stripped so that URLs built as
# f"{INDEXTTS_URL}/gradio_api/..." never contain a double slash.
INDEXTTS_URL = (
    os.environ.get("INDEXTTS_URL") or "http://100.107.41.75:7860"
).rstrip("/")

# Directory containing this script; the reference voice is located relative
# to it (../assets/ref_voice.mp3).
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REF_AUDIO = os.path.join(SCRIPT_DIR, "..", "assets", "ref_voice.mp3")

# Generated wav files are written here.
OUTPUT_DIR = os.path.expanduser("~/down")

# cache the uploaded ref path to avoid re-uploading
_CACHE_FILE = "/tmp/noc_gen_voice_ref_cache.json"
|
|
|
|
# Tool description printed (as JSON) by `--schema`: the tool name, a
# human-readable description, and a JSON-Schema style parameter spec with a
# single required string parameter, "text".
SCHEMA = {
    "name": "gen_voice",
    "description": (
        "Generate speech audio from text using voice cloning (IndexTTS2). "
        "Returns the file path of the generated wav. "
        "Use send_file to send it to the user."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to synthesize into speech",
            },
        },
        "required": ["text"],
    },
}
|
|
|
|
|
|
def get_ref_path():
    """Return the server-side path of the reference voice, uploading it if needed.

    The path returned by a previous upload is stored in ``_CACHE_FILE``. It is
    reused only when a HEAD request for that path on the IndexTTS server
    answers 200. In every other case (no cache file, unreadable or malformed
    cache, server unreachable, file no longer known to the server) the
    reference audio ``REF_AUDIO`` is uploaded again and the cache rewritten.

    Returns:
        The first entry of the JSON list returned by the upload endpoint.

    Raises:
        OSError: if the reference audio file cannot be opened.
        requests.RequestException: if the upload fails or times out.
        RuntimeError: if the upload response contains no path.
    """
    # Cache lookup is best-effort: any expected failure here simply falls
    # through to a fresh upload.
    try:
        with open(_CACHE_FILE, encoding="utf-8") as f:
            cached = json.load(f)["path"]
        if isinstance(cached, str) and cached:
            # quick health check: if the server is up and still has the
            # file, reuse it
            r = requests.head(
                f"{INDEXTTS_URL}/gradio_api/file={cached}", timeout=3
            )
            if r.status_code == 200:
                return cached
    except (OSError, ValueError, KeyError, TypeError, requests.RequestException):
        # Missing/corrupt cache or unreachable server.
        pass

    # Upload. The timeout keeps an unresponsive server from hanging the tool.
    with open(REF_AUDIO, "rb") as f:
        resp = requests.post(
            f"{INDEXTTS_URL}/gradio_api/upload",
            files={"files": f},
            timeout=30,
        )
    resp.raise_for_status()
    uploaded = resp.json()
    if not isinstance(uploaded, list) or not uploaded:
        raise RuntimeError(
            "TTS server returned no path for the uploaded reference audio"
        )
    ref_path = uploaded[0]

    # The cache is only an optimization: a failed write must not discard a
    # successful upload, it just means the next run uploads again.
    try:
        with open(_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"path": ref_path}, f)
    except OSError as e:
        print(f"Warning: could not write {_CACHE_FILE}: {e}", file=sys.stderr)

    return ref_path
|
|
|
|
|
|
def synthesize(text):
    """Synthesize *text* with IndexTTS2 and save the result as a wav file.

    Submits a job to the server's ``synthesize`` endpoint, using the path
    returned by ``get_ref_path()`` as both the speaker and the emotion
    reference audio. It then reads the server-sent-event stream for that job,
    downloads the audio from the first result carrying a ``url``, and writes
    it to ``OUTPUT_DIR`` as ``tts_<YYYYmmdd_HHMMSS>.wav``.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the wav file that was written.

    Raises:
        RuntimeError: if the server reports an error / null result, or the
            stream ends without delivering a result.
        requests.RequestException: on HTTP errors or timeouts.
    """
    ref = get_ref_path()
    file_data = {"path": ref, "meta": {"_type": "gradio.FileData"}}

    # submit job
    resp = requests.post(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize",
        json={
            "data": [
                text,
                file_data,  # spk_audio
                file_data,  # emo_audio
                0.5,  # emo_alpha
                0, 0, 0, 0, 0, 0, 0, 0.8,  # emotions (calm=0.8)
                False,  # use_emo_text
                "",  # emo_text
                False,  # use_random
            ]
        },
        timeout=30,
    )
    resp.raise_for_status()
    event_id = resp.json()["event_id"]

    # Read the result via SSE. Only a connect timeout is set: how long the
    # server needs before it emits the result is not known here. The `with`
    # block guarantees the streamed connection is released.
    with requests.get(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize/{event_id}",
        stream=True,
        timeout=(10, None),
    ) as result_resp:
        result_resp.raise_for_status()
        # Server-sent events are UTF-8 by definition; without this, requests
        # may guess another charset (or yield bytes) for text/event-stream.
        result_resp.encoding = "utf-8"

        event = ""  # name from the most recent "event:" line
        for line in result_resp.iter_lines(decode_unicode=True):
            if line.startswith("event:"):
                event = line[len("event:"):].strip()
                continue
            if not line.startswith("data: "):
                continue

            data = json.loads(line[6:])

            # NOTE(review): Gradio's documented SSE events include
            # "heartbeat" keep-alives; their null payload is not a failure.
            # Confirm against the server version in use.
            if event == "heartbeat":
                continue
            if data is None:
                raise RuntimeError("TTS synthesis failed (server returned null)")
            if event == "error":
                raise RuntimeError(f"TTS synthesis failed: {data}")
            if not (isinstance(data, list) and data):
                continue

            first = data[0]
            url = first.get("url", "") if isinstance(first, dict) else ""
            if not url:
                continue

            # download the wav
            wav = requests.get(url, timeout=(10, 120))
            wav.raise_for_status()
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            ts = time.strftime("%Y%m%d_%H%M%S")
            out_path = os.path.join(OUTPUT_DIR, f"tts_{ts}.wav")
            with open(out_path, "wb") as f:
                f.write(wav.content)
            return out_path

    raise RuntimeError("No result received from TTS server")
|
|
|
|
|
|
def main():
    """Command-line entry point.

    * No argument, ``--help`` or ``-h``: print the module docstring, exit 0.
    * ``--schema``: print ``SCHEMA`` as JSON, exit 0.
    * Otherwise the text to speak is taken from the ``"text"`` key of a JSON
      object when the first argument starts with ``{``, or from all arguments
      joined by single spaces.

    On success the path returned by ``synthesize`` is printed. Invalid JSON,
    empty text, or any exception from synthesis prints a message and exits
    with status 1.
    """
    cli_args = sys.argv[1:]

    if not cli_args or cli_args[0] in ("--help", "-h"):
        print(__doc__.strip())
        sys.exit(0)

    first = cli_args[0]
    if first == "--schema":
        print(json.dumps(SCHEMA, ensure_ascii=False))
        sys.exit(0)

    if first.startswith("{"):
        # JSON form: {"text": "..."}
        try:
            text = json.loads(first).get("text", "")
        except json.JSONDecodeError as err:
            print(f"Invalid JSON: {err}")
            sys.exit(1)
    else:
        # Plain form: every argument is part of the text.
        text = " ".join(cli_args)

    if not text:
        print("Error: text is required")
        sys.exit(1)

    try:
        print(synthesize(text))
    except Exception as err:
        print(f"Error: {err}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|