add gen_voice tool, message timestamps, image multimodal, group chat, whisper STT
- gen_voice: IndexTTS2 voice cloning via tools/gen_voice script, ref audio cached on server to avoid re-upload
- Message timestamps: created_at column in messages table, prepended to content in API calls so LLM sees message times
- Image understanding: photos converted to base64 multimodal content for vision-capable models
- Group chat: independent session contexts per chat_id, sendMessageDraft disabled in groups (private chat only)
- Voice transcription: whisper service integration, transcribed text injected as [语音消息] prefix
- Integration tests marked #[ignore] (require external services)
- Reference voice asset: assets/ref_voice.mp3
- .gitignore: target/, noc.service, config/state/db files
This commit is contained in:
152
tools/gen_voice
Executable file
152
tools/gen_voice
Executable file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env -S uv run --script
|
||||
# /// script
|
||||
# requires-python = ">=3.11"
|
||||
# dependencies = ["requests"]
|
||||
# ///
|
||||
"""Generate voice audio using IndexTTS2 with a fixed reference voice.
|
||||
|
||||
Usage:
|
||||
./gen_voice --schema
|
||||
./gen_voice '{"text":"你好世界"}'
|
||||
./gen_voice 你好世界
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import requests
|
||||
|
||||
# Base URL of the IndexTTS2 server that every request in this script targets.
INDEXTTS_URL = "http://100.107.41.75:7860"

# Directory containing this script; the reference voice is located relative
# to it (one level up, under assets/).
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
REF_AUDIO = os.path.join(SCRIPT_DIR, "..", "assets", "ref_voice.mp3")

# Directory that receives the generated wav files.
OUTPUT_DIR = os.path.expanduser("~/down")

# Remembers the server-side path of the uploaded reference audio so that
# later runs can skip the upload.
_CACHE_FILE = "/tmp/noc_gen_voice_ref_cache.json"
|
||||
|
||||
# Tool description printed by `--schema`: the tool name, a human-readable
# description, and a JSON-Schema-style parameter spec with one required
# string field, "text".
SCHEMA = dict(
    name="gen_voice",
    description="Generate speech audio from text using voice cloning (IndexTTS2). Returns the file path of the generated wav. Use send_file to send it to the user.",
    parameters=dict(
        type="object",
        properties=dict(
            text=dict(
                type="string",
                description="The text to synthesize into speech",
            ),
        ),
        required=["text"],
    ),
)
|
||||
|
||||
|
||||
def get_ref_path():
    """Return the server-side path of the reference voice, uploading if needed.

    The path reported by the upload endpoint is cached in ``_CACHE_FILE``.
    A cached path is reused only while a HEAD request for it against the
    server answers 200; in every other case (no cache, unreadable cache,
    server unreachable, non-200 answer) the reference audio is uploaded
    again and the cache is rewritten.

    Returns:
        The first entry of the upload endpoint's JSON response, i.e. the
        path under which the server stored ``REF_AUDIO``.

    Raises:
        OSError: if ``REF_AUDIO`` cannot be opened.
        requests.RequestException: if the upload fails, times out, or the
            server answers with an error status.
        RuntimeError: if the upload response is not a non-empty list.
    """
    # Cache lookup is strictly best-effort: any problem reading the file or
    # probing the server simply falls through to a fresh upload.
    try:
        with open(_CACHE_FILE, encoding="utf-8") as f:
            cached = json.load(f)["path"]
        probe = requests.head(
            f"{INDEXTTS_URL}/gradio_api/file={cached}", timeout=3
        )
        if probe.status_code == 200:
            return cached
    except (OSError, ValueError, KeyError, TypeError, requests.RequestException):
        # OSError: cache file missing/unreadable; ValueError: invalid JSON;
        # KeyError/TypeError: unexpected cache shape; RequestException:
        # server unreachable or too slow.
        pass

    # Upload the reference audio. A timeout keeps a dead server from
    # blocking the tool forever.
    with open(REF_AUDIO, "rb") as f:
        resp = requests.post(
            f"{INDEXTTS_URL}/gradio_api/upload",
            files={"files": f},
            timeout=60,
        )
    resp.raise_for_status()

    uploaded = resp.json()
    if not isinstance(uploaded, list) or not uploaded:
        raise RuntimeError(f"Unexpected upload response from TTS server: {uploaded!r}")
    ref_path = uploaded[0]

    # The cache is only an optimisation; failing to write it must not
    # discard a successful upload.
    try:
        with open(_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"path": ref_path}, f)
    except OSError:
        pass

    return ref_path
|
||||
|
||||
|
||||
def synthesize(text):
    """Synthesize *text* with the reference voice and save the result as a wav.

    Submits a job to the server's ``synthesize`` call endpoint, then reads
    the server-sent event stream for that job until an event carries a
    result with a download URL. The audio behind that URL is written to
    ``OUTPUT_DIR`` as ``tts_<YYYYmmdd_HHMMSS>.wav``.

    Args:
        text: The text to synthesize.

    Returns:
        The path of the written wav file.

    Raises:
        requests.RequestException: if any HTTP request fails, times out, or
            returns an error status.
        RuntimeError: if the server reports a failure or the event stream
            ends without a result.
    """
    ref = get_ref_path()
    file_data = {"path": ref, "meta": {"_type": "gradio.FileData"}}

    # Submit the job; the response only carries the event id to follow.
    resp = requests.post(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize",
        json={
            "data": [
                text,
                file_data,  # spk_audio
                file_data,  # emo_audio
                0.5,  # emo_alpha
                0, 0, 0, 0, 0, 0, 0, 0.8,  # emotions (calm=0.8)
                False,  # use_emo_text
                "",  # emo_text
                False,  # use_random
            ]
        },
        timeout=30,
    )
    resp.raise_for_status()
    event_id = resp.json()["event_id"]

    # Follow the result stream. The context manager releases the streamed
    # connection on every exit path; the read timeout bounds the silence
    # between two chunks, not the whole synthesis.
    with requests.get(
        f"{INDEXTTS_URL}/gradio_api/call/synthesize/{event_id}",
        stream=True,
        timeout=(10, 300),
    ) as events:
        events.raise_for_status()
        # Decode the stream as UTF-8 rather than whatever charset requests
        # would otherwise guess from the Content-Type header.
        events.encoding = "utf-8"

        event_name = ""
        for line in events.iter_lines(decode_unicode=True):
            if line.startswith("event: "):
                event_name = line[len("event: "):].strip()
                continue
            if not line.startswith("data: "):
                continue
            # Gradio's call API sends keep-alive "heartbeat" events whose
            # data is null; those are not failures and must be skipped.
            if event_name == "heartbeat":
                continue

            data = json.loads(line[len("data: "):])
            if data is None:
                raise RuntimeError("TTS synthesis failed (server returned null)")
            if event_name == "error":
                raise RuntimeError(f"TTS synthesis failed: {data}")
            if not (isinstance(data, list) and data):
                continue

            first = data[0]
            url = first.get("url", "") if isinstance(first, dict) else ""
            if not url:
                continue

            # download the wav
            wav = requests.get(url, timeout=60)
            wav.raise_for_status()
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            ts = time.strftime("%Y%m%d_%H%M%S")
            out_path = os.path.join(OUTPUT_DIR, f"tts_{ts}.wav")
            with open(out_path, "wb") as f:
                f.write(wav.content)
            return out_path

    raise RuntimeError("No result received from TTS server")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    With no argument, ``--help`` or ``-h``: print the module docstring and
    exit 0. With ``--schema``: print ``SCHEMA`` as JSON and exit 0.
    Otherwise the text comes either from a JSON object given as the first
    argument (its ``"text"`` key) or, when the first argument does not start
    with ``{``, from all arguments joined with spaces. On success the path
    returned by ``synthesize`` is printed; invalid JSON, empty text, or any
    synthesis error prints a message and exits 1.
    """
    cli_args = sys.argv[1:]

    if not cli_args or cli_args[0] in {"--help", "-h"}:
        print(__doc__.strip())
        sys.exit(0)

    first = cli_args[0]
    if first == "--schema":
        print(json.dumps(SCHEMA, ensure_ascii=False))
        sys.exit(0)

    if first.startswith("{"):
        # JSON form: only the first argument is considered.
        try:
            text = json.loads(first).get("text", "")
        except json.JSONDecodeError as err:
            print(f"Invalid JSON: {err}")
            sys.exit(1)
    else:
        # Plain form: every argument is part of the text.
        text = " ".join(cli_args)

    if not text:
        print("Error: text is required")
        sys.exit(1)

    try:
        print(synthesize(text))
    except Exception as err:
        print(f"Error: {err}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user