Files
noc/tools/gen_voice
Fam Zheng ec1bd7cb25 add gen_voice tool, message timestamps, image multimodal, group chat, whisper STT
- gen_voice: IndexTTS2 voice cloning via tools/gen_voice script, ref audio
  cached on server to avoid re-upload
- Message timestamps: created_at column in messages table, prepended to
  content in API calls so LLM sees message times
- Image understanding: photos converted to base64 multimodal content
  for vision-capable models
- Group chat: independent session contexts per chat_id, sendMessageDraft
  disabled in groups (private chat only)
- Voice transcription: whisper service integration, transcribed text
  injected as [语音消息] prefix
- Integration tests marked #[ignore] (require external services)
- Reference voice asset: assets/ref_voice.mp3
- .gitignore: target/, noc.service, config/state/db files
2026-04-09 20:12:15 +01:00

153 lines
4.4 KiB
Plaintext
Executable File

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = ["requests"]
# ///
"""Generate voice audio using IndexTTS2 with a fixed reference voice.
Usage:
./gen_voice --schema
./gen_voice '{"text":"你好世界"}'
./gen_voice 你好世界
"""
import json
import os
import sys
import time
import requests
# Base URL of the IndexTTS2 Gradio server. Set the INDEXTTS_URL environment
# variable to point at a different server; when it is unset or empty the
# original fixed address is used. A trailing slash is dropped because the
# endpoints are appended as "/gradio_api/...".
INDEXTTS_URL = (os.environ.get("INDEXTTS_URL") or "http://100.107.41.75:7860").rstrip("/")
# Directory containing this script; the reference voice is located relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REF_AUDIO = os.path.join(SCRIPT_DIR, "..", "assets", "ref_voice.mp3")
# Generated wav files are written here.
OUTPUT_DIR = os.path.expanduser("~/down")
# cache the uploaded ref path to avoid re-uploading
_CACHE_FILE = "/tmp/noc_gen_voice_ref_cache.json"
# Tool description printed as JSON by `--schema`: the tool name, a
# human-readable description, and a JSON-Schema style parameter object with a
# single required string parameter, "text".
SCHEMA = {
    "name": "gen_voice",
    "description": "Generate speech audio from text using voice cloning (IndexTTS2). Returns the file path of the generated wav. Use send_file to send it to the user.",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to synthesize into speech",
            },
        },
        "required": ["text"],
    },
}
def get_ref_path():
    """Return the server-side path of the reference voice, uploading it if needed.

    The path returned by a previous upload is stored in ``_CACHE_FILE``. It is
    reused only when a HEAD request for that path against the server answers
    200; in every other case (no cache, unreadable cache, server unreachable,
    non-200 answer) the reference audio is uploaded again.

    Returns:
        The first element of the upload endpoint's JSON response.

    Raises:
        OSError: If ``REF_AUDIO`` cannot be opened.
        requests.RequestException: If the upload fails, times out, or the
            server answers with an error status.
        RuntimeError: If the upload response is not a non-empty list.
    """
    # Try the cached path first. Any failure here only means the cache is not
    # usable, so it falls through to a fresh upload instead of aborting.
    try:
        with open(_CACHE_FILE, encoding="utf-8") as f:
            cached_path = json.load(f)["path"]
        probe = requests.head(
            f"{INDEXTTS_URL}/gradio_api/file={cached_path}", timeout=3
        )
        if probe.status_code == 200:
            return cached_path
    except (OSError, ValueError, KeyError, TypeError, requests.RequestException):
        # Missing/corrupt cache file (OSError, ValueError from the JSON
        # decoder, KeyError/TypeError from an unexpected shape) or a failed
        # probe request: upload again.
        pass

    # Upload the reference audio. The timeout keeps a dead server from
    # hanging the tool forever.
    with open(REF_AUDIO, "rb") as f:
        resp = requests.post(
            f"{INDEXTTS_URL}/gradio_api/upload",
            files={"files": f},
            timeout=60,
        )
    resp.raise_for_status()
    uploaded = resp.json()
    if not isinstance(uploaded, list) or not uploaded:
        raise RuntimeError(f"Unexpected upload response: {uploaded!r}")
    ref_path = uploaded[0]

    # Caching is only an optimisation: the upload already succeeded, so a
    # failure to write the cache file must not fail the whole call.
    try:
        with open(_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"path": ref_path}, f)
    except OSError:
        pass
    return ref_path
def _await_audio_url(event_id):
    """Read the job's server-sent event stream and return the result audio URL."""
    # (connect, read) timeout: the read timeout bounds the gap between two
    # chunks of the stream, not the total job duration.
    with requests.get(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize/{event_id}",
        stream=True,
        timeout=(10, 300),
    ) as stream:
        stream.raise_for_status()
        # iter_lines(decode_unicode=True) yields bytes when the response
        # declares no charset; force one so every line is a str.
        if stream.encoding is None:
            stream.encoding = "utf-8"

        event = None  # name from the most recent "event:" line, if any
        for line in stream.iter_lines(decode_unicode=True):
            if line.startswith("event:"):
                event = line[len("event:"):].strip()
                continue
            if not line.startswith("data: "):
                continue
            if event == "heartbeat":
                # Keep-alive event; its "data: null" payload is not a failure.
                continue
            data = json.loads(line[6:])
            if event == "error" and data is not None:
                raise RuntimeError(f"TTS synthesis failed: {data}")
            if data is None:
                raise RuntimeError("TTS synthesis failed (server returned null)")
            if isinstance(data, list) and data:
                first = data[0]
                url = first.get("url", "") if isinstance(first, dict) else ""
                if url:
                    return url
    raise RuntimeError("No result received from TTS server")


def synthesize(text):
    """Synthesize *text* with the reference voice and save the result as a wav.

    Submits a job to the server's ``synthesize`` endpoint using the uploaded
    reference audio for both the speaker and the emotion input, waits for the
    result on the job's event stream, downloads the audio from the ``url`` of
    the first output and writes it to ``OUTPUT_DIR/tts_<timestamp>.wav``.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the wav file that was written.

    Raises:
        RuntimeError: If the server reports a failure or the stream ends
            without a result.
        requests.RequestException: If a request fails, times out, or is
            answered with an error status.
        OSError: If the output file cannot be written.
    """
    ref = get_ref_path()
    file_data = {"path": ref, "meta": {"_type": "gradio.FileData"}}

    # submit job
    resp = requests.post(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize",
        json={
            "data": [
                text,
                file_data,  # spk_audio
                file_data,  # emo_audio
                0.5,  # emo_alpha
                0, 0, 0, 0, 0, 0, 0, 0.8,  # emotions (calm=0.8)
                False,  # use_emo_text
                "",  # emo_text
                False,  # use_random
            ]
        },
        timeout=30,
    )
    resp.raise_for_status()
    event_id = resp.json()["event_id"]

    # The event stream is closed before the download starts.
    url = _await_audio_url(event_id)

    # download the wav
    wav = requests.get(url, timeout=60)
    wav.raise_for_status()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    ts = time.strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(OUTPUT_DIR, f"tts_{ts}.wav")
    with open(out_path, "wb") as f:
        f.write(wav.content)
    return out_path
def main():
    """Command-line entry point.

    With no argument, ``--help`` or ``-h``: print the module docstring and
    exit 0. With ``--schema``: print ``SCHEMA`` as JSON and exit 0.
    Otherwise the text comes either from a JSON object in the first argument
    (its ``"text"`` key) or, when the first argument does not start with
    ``{``, from all arguments joined by spaces. On success the generated file
    path is printed; on any failure an error line is printed and the exit
    status is 1.
    """
    cli_args = sys.argv[1:]

    if not cli_args or cli_args[0] in ("--help", "-h"):
        print(__doc__.strip())
        sys.exit(0)

    first = cli_args[0]
    if first == "--schema":
        print(json.dumps(SCHEMA, ensure_ascii=False))
        sys.exit(0)

    if first.startswith("{"):
        # JSON form: only the first argument is considered.
        try:
            text = json.loads(first).get("text", "")
        except json.JSONDecodeError as err:
            print(f"Invalid JSON: {err}")
            sys.exit(1)
    else:
        # Plain form: every argument is part of the text.
        text = " ".join(cli_args)

    if not text:
        print("Error: text is required")
        sys.exit(1)

    try:
        print(synthesize(text))
    except Exception as err:
        print(f"Error: {err}")
        sys.exit(1)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()