add gen_voice tool, message timestamps, image multimodal, group chat, whisper STT

- gen_voice: IndexTTS2 voice cloning via tools/gen_voice script, ref audio cached on server to avoid re-upload
- Message timestamps: created_at column in messages table, prepended to content in API calls so LLM sees message times
- Image understanding: photos converted to base64 multimodal content for vision-capable models
- Group chat: independent session contexts per chat_id, sendMessageDraft disabled in groups (private chat only)
- Voice transcription: whisper service integration, transcribed text injected as [语音消息] prefix
- Integration tests marked #[ignore] (require external services)
- Reference voice asset: assets/ref_voice.mp3
- .gitignore: target/, noc.service, config/state/db files
2026-04-09 20:12:15 +01:00
parent 9d5dd4eb16
commit ec1bd7cb25
6 changed files with 370 additions and 54 deletions
--- a/tools/gen_voice
+++ b/tools/gen_voice
@@ -0,0 +1,152 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["requests"]
+# ///
+"""Generate voice audio using IndexTTS2 with a fixed reference voice.
+
+Usage:
+    ./gen_voice --schema
+    ./gen_voice '{"text":"你好世界"}'
+    ./gen_voice 你好世界
+"""
+
+import json
+import os
+import sys
+import time
+import requests
+
+# Base URL of the IndexTTS2 server; every request below is built as
+# f"{INDEXTTS_URL}/gradio_api/...".
+INDEXTTS_URL = "http://100.107.41.75:7860"
+# Directory containing this script, used to locate the bundled reference voice.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Local reference voice file that get_ref_path() uploads to the server.
+REF_AUDIO = os.path.join(SCRIPT_DIR, "..", "assets", "ref_voice.mp3")
+# Directory where synthesize() writes the generated wav files.
+OUTPUT_DIR = os.path.expanduser("~/down")
+
+# JSON file ({"path": ...}) remembering the server-side path of the uploaded
+# reference audio, so it does not have to be re-uploaded on every run.
+_CACHE_FILE = "/tmp/noc_gen_voice_ref_cache.json"
+
+# Tool description printed as JSON by `./gen_voice --schema`: the tool name,
+# a description for the caller, and a JSON-Schema style "parameters" object
+# declaring the single required string argument "text".
+SCHEMA = {
+    "name": "gen_voice",
+    "description": "Generate speech audio from text using voice cloning (IndexTTS2). Returns the file path of the generated wav. Use send_file to send it to the user.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "text": {
+                "type": "string",
+                "description": "The text to synthesize into speech",
+            },
+        },
+        "required": ["text"],
+    },
+}
+
+
+def get_ref_path():
+    """Return the server-side path of the reference voice, uploading it if needed.
+
+    The path reported by a previous upload is stored in ``_CACHE_FILE``. It is
+    reused only when a HEAD request for that file on the server answers 200.
+    In every other case (no cache, unreadable or malformed cache, server
+    unreachable, file no longer served) the reference audio is uploaded again
+    and the cache file is rewritten.
+
+    Returns:
+        The first entry of the JSON list returned by the upload endpoint,
+        i.e. the path under which the server stored the reference audio.
+
+    Raises:
+        OSError: if the local reference audio cannot be opened.
+        requests.RequestException: if the upload request fails, times out or
+            returns an HTTP error status.
+    """
+    # Cache lookup is best-effort: any of the expected failures below simply
+    # falls through to a fresh upload. Opening the file directly (instead of
+    # checking os.path.exists first) avoids a check-then-use race.
+    try:
+        with open(_CACHE_FILE, encoding="utf-8") as f:
+            cached = json.load(f)["path"]
+        # quick check that the server is up and still serves the cached file
+        r = requests.head(f"{INDEXTTS_URL}/gradio_api/file={cached}", timeout=3)
+        if r.status_code == 200:
+            return cached
+    except (OSError, ValueError, KeyError, TypeError, requests.RequestException):
+        # OSError: cache missing/unreadable; ValueError: invalid JSON or
+        # encoding; KeyError/TypeError: unexpected cache layout;
+        # RequestException: server unreachable or too slow.
+        pass
+
+    # upload -- requests has no default timeout, so set one explicitly to
+    # avoid hanging forever on an unresponsive server
+    with open(REF_AUDIO, "rb") as f:
+        resp = requests.post(
+            f"{INDEXTTS_URL}/gradio_api/upload",
+            files={"files": f},
+            timeout=(5, 60),
+        )
+    resp.raise_for_status()
+    ref_path = resp.json()[0]
+
+    # cache -- only an optimisation, so a failed write must not discard a
+    # successful upload
+    try:
+        with open(_CACHE_FILE, "w", encoding="utf-8") as f:
+            json.dump({"path": ref_path}, f)
+    except OSError:
+        pass
+
+    return ref_path
+
+
+def synthesize(text):
+    """Synthesize *text* with the reference voice and save the result as a wav.
+
+    Submits a job to the server's ``synthesize`` endpoint, reads the outcome
+    from the server-sent event (SSE) stream for that job, downloads the
+    generated audio and writes it to ``OUTPUT_DIR`` as
+    ``tts_<YYYYmmdd_HHMMSS>.wav``.
+
+    Args:
+        text: The text to synthesize.
+
+    Returns:
+        Path of the wav file written to ``OUTPUT_DIR``.
+
+    Raises:
+        RuntimeError: if the server reports an error, sends a null result, or
+            the stream ends without a downloadable result.
+        requests.RequestException: on connection failures, timeouts or HTTP
+            error statuses.
+    """
+    ref = get_ref_path()
+    file_data = {"path": ref, "meta": {"_type": "gradio.FileData"}}
+    # requests never times out by default; bound every call explicitly.
+    connect_timeout = 10
+
+    # submit job
+    resp = requests.post(
+        f"{INDEXTTS_URL}/gradio_api/call/synthesize",
+        json={
+            "data": [
+                text,
+                file_data,  # spk_audio
+                file_data,  # emo_audio
+                0.5,  # emo_alpha
+                0, 0, 0, 0, 0, 0, 0, 0.8,  # emotions (calm=0.8)
+                False,  # use_emo_text
+                "",  # emo_text
+                False,  # use_random
+            ]
+        },
+        timeout=(connect_timeout, 30),
+    )
+    resp.raise_for_status()
+    event_id = resp.json()["event_id"]
+
+    # Read the result from the SSE stream. Each message is an "event: <name>"
+    # line followed by a "data: <json>" line. Heartbeat messages carry
+    # "data: null" and must be skipped, otherwise any job that outlives the
+    # first heartbeat would be reported as a failure.
+    with requests.get(
+        f"{INDEXTTS_URL}/gradio_api/call/synthesize/{event_id}",
+        stream=True,
+        timeout=(connect_timeout, 600),
+    ) as stream:
+        stream.raise_for_status()
+        # SSE payloads are UTF-8; make sure iter_lines yields str, not bytes
+        stream.encoding = "utf-8"
+        event = None
+        for line in stream.iter_lines(decode_unicode=True):
+            if line.startswith("event: "):
+                event = line[7:].strip()
+                continue
+            if not line.startswith("data: ") or event == "heartbeat":
+                continue
+
+            data = json.loads(line[6:])
+            if event == "error" or data is None:
+                detail = "server returned null" if data is None else data
+                raise RuntimeError(f"TTS synthesis failed ({detail})")
+
+            if isinstance(data, list) and data and isinstance(data[0], dict):
+                url = data[0].get("url", "")
+                if url:
+                    # download the wav
+                    wav = requests.get(url, timeout=(connect_timeout, 120))
+                    wav.raise_for_status()
+                    os.makedirs(OUTPUT_DIR, exist_ok=True)
+                    ts = time.strftime("%Y%m%d_%H%M%S")
+                    out_path = os.path.join(OUTPUT_DIR, f"tts_{ts}.wav")
+                    with open(out_path, "wb") as f:
+                        f.write(wav.content)
+                    return out_path
+
+    raise RuntimeError("No result received from TTS server")
+
+
+def main():
+    """Command-line entry point.
+
+    Behaviour by first argument:
+      * none, ``--help`` or ``-h``: print the module usage text, exit 0.
+      * ``--schema``: print ``SCHEMA`` as JSON, exit 0.
+      * an argument starting with ``{``: parse it as JSON and use its
+        ``"text"`` field.
+      * anything else: join all arguments with spaces and use that as text.
+
+    On success the path of the generated wav is printed. Every failure
+    (invalid JSON, missing or non-string text, synthesis error) prints a
+    message to stdout and exits with status 1.
+    """
+    if len(sys.argv) < 2 or sys.argv[1] in ("--help", "-h"):
+        print(__doc__.strip())
+        sys.exit(0)
+
+    if sys.argv[1] == "--schema":
+        print(json.dumps(SCHEMA, ensure_ascii=False))
+        sys.exit(0)
+
+    arg = sys.argv[1]
+    if not arg.startswith("{"):
+        text = " ".join(sys.argv[1:])
+    else:
+        try:
+            args = json.loads(arg)
+            text = args.get("text", "")
+        except json.JSONDecodeError as e:
+            print(f"Invalid JSON: {e}")
+            sys.exit(1)
+
+    # Empty, null or whitespace-only text carries nothing to synthesize.
+    if not text or (isinstance(text, str) and not text.strip()):
+        print("Error: text is required")
+        sys.exit(1)
+
+    # JSON input can carry any type (number, list, object); only a string
+    # is meaningful to send to the TTS server.
+    if not isinstance(text, str):
+        print("Error: text must be a string")
+        sys.exit(1)
+
+    try:
+        path = synthesize(text)
+        print(path)
+    except Exception as e:
+        # Top-level boundary: report any failure as a one-line message for
+        # the caller instead of a traceback.
+        print(f"Error: {e}")
+        sys.exit(1)
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()