native multimodal audio, remove whisper transcription, filter media from prompt text

- Send audio files (ogg/wav/mp3/flac/m4a) as audio_url in multimodal content
- Remove whisper_url config and transcribe_audio — LLM handles audio natively
- Skip media files in build_prompt text (only mention non-media uploads)
- Log multimodal content types and successful API calls for debugging
This commit is contained in:
Fam Zheng
2026-04-11 17:58:00 +01:00
parent 7000ccda0f
commit a26d58e581
4 changed files with 58 additions and 68 deletions

View File

@@ -204,7 +204,7 @@ async fn handle_inner(
) -> Result<()> {
let mut uploaded: Vec<PathBuf> = Vec::new();
let mut download_errors: Vec<String> = Vec::new();
let mut transcriptions: Vec<String> = Vec::new();
let transcriptions: Vec<String> = Vec::new();
if let Some(doc) = msg.document() {
let name = doc.file_name.as_deref().unwrap_or("file");
@@ -228,20 +228,7 @@ async fn handle_inner(
let fallback = format!("audio_{}.ogg", Local::now().format("%H%M%S"));
let name = audio.file_name.as_deref().unwrap_or(&fallback);
match download_tg_file(bot, &audio.file.id, name).await {
Ok(p) => {
if let Some(url) = &config.whisper_url {
match transcribe_audio(url, &p).await {
Ok(t) if !t.is_empty() => transcriptions.push(t),
Ok(_) => uploaded.push(p),
Err(e) => {
warn!("transcribe failed: {e:#}");
uploaded.push(p);
}
}
} else {
uploaded.push(p);
}
}
Ok(p) => uploaded.push(p),
Err(e) => download_errors.push(format!("audio: {e:#}")),
}
}
@@ -249,20 +236,7 @@ async fn handle_inner(
if let Some(voice) = msg.voice() {
let name = format!("voice_{}.ogg", Local::now().format("%H%M%S"));
match download_tg_file(bot, &voice.file.id, &name).await {
Ok(p) => {
if let Some(url) = &config.whisper_url {
match transcribe_audio(url, &p).await {
Ok(t) if !t.is_empty() => transcriptions.push(t),
Ok(_) => uploaded.push(p),
Err(e) => {
warn!("transcribe failed: {e:#}");
uploaded.push(p);
}
}
} else {
uploaded.push(p);
}
}
Ok(p) => uploaded.push(p),
Err(e) => download_errors.push(format!("voice: {e:#}")),
}
}
@@ -502,8 +476,17 @@ fn build_prompt(
parts.push(format!("[语音消息] {t}"));
}
// only mention files that won't be sent as multimodal content
let multimodal_exts = ["jpg", "jpeg", "png", "gif", "webp", "mp4", "webm", "mov",
"ogg", "oga", "opus", "wav", "mp3", "flac", "m4a"];
for f in uploaded {
parts.push(format!("[用户上传了文件: {}]", f.display()));
let is_media = f.extension()
.and_then(|e| e.to_str())
.map(|e| multimodal_exts.contains(&e.to_lowercase().as_str()))
.unwrap_or(false);
if !is_media {
parts.push(format!("[用户上传了文件: {}]", f.display()));
}
}
for e in errors {
@@ -517,24 +500,3 @@ fn build_prompt(
parts.join("\n")
}
/// Transcribe an audio file via an OpenAI-compatible
/// `/v1/audio/transcriptions` endpoint (e.g. a whisper.cpp server).
///
/// `whisper_url` is the server base URL (trailing slashes tolerated);
/// `file_path` is the local audio file to upload.
///
/// Returns the transcribed text, or an empty string when the response has no
/// `text` field (callers treat "" as "no transcript"). Errors on file-read
/// failure, request failure, or a non-2xx HTTP status.
async fn transcribe_audio(whisper_url: &str, file_path: &Path) -> Result<String> {
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(60))
        .build()?;
    let url = format!("{}/v1/audio/transcriptions", whisper_url.trim_end_matches('/'));
    let file_bytes = tokio::fs::read(file_path).await?;
    let file_name = file_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("audio.ogg")
        .to_string();
    // Pick the MIME type from the extension instead of hard-coding audio/ogg:
    // the bot also downloads wav/mp3/flac/m4a uploads, and a mislabelled
    // content type can make the transcription server reject or misparse the file.
    let mime = match file_path
        .extension()
        .and_then(|e| e.to_str())
        .map(|e| e.to_ascii_lowercase())
        .as_deref()
    {
        Some("wav") => "audio/wav",
        Some("mp3") => "audio/mpeg",
        Some("flac") => "audio/flac",
        Some("m4a") => "audio/mp4",
        // ogg/oga/opus and anything unrecognized keep the previous default.
        _ => "audio/ogg",
    };
    let part = reqwest::multipart::Part::bytes(file_bytes)
        .file_name(file_name)
        .mime_str(mime)?;
    let form = reqwest::multipart::Form::new()
        .part("file", part)
        .text("model", "base");
    let resp = client.post(&url).multipart(form).send().await?.error_for_status()?;
    let json: serde_json::Value = resp.json().await?;
    // Missing or non-string "text" degrades to an empty transcript rather than erroring.
    Ok(json["text"].as_str().unwrap_or("").to_string())
}