feat: status_reason field for workflows + proper failure logging

- Add status_reason column to workflows table (migration)
- AgentUpdate::WorkflowStatus and WorkflowComplete carry reason
- Dispatch failure logs to execution_log with reason
- Worker disconnect marks orphaned workflows as failed with reason
- All status transitions now have traceable cause
This commit is contained in:
2026-04-06 20:33:41 +01:00
parent 76b964998b
commit c56bfd9377
6 changed files with 66 additions and 16 deletions

View File

@@ -158,15 +158,20 @@ impl AgentManager {
tracing::info!("Workflow {} dispatched to worker '{}'", workflow_id, name); tracing::info!("Workflow {} dispatched to worker '{}'", workflow_id, name);
} }
Err(e) => { Err(e) => {
tracing::error!("Failed to dispatch workflow {}: {}", workflow_id, e); let reason = format!("调度失败: {}", e);
let _ = sqlx::query("UPDATE workflows SET status = 'failed' WHERE id = ?") tracing::error!("Failed to dispatch workflow {}: {}", workflow_id, reason);
.bind(&workflow_id).execute(&self.pool).await; let _ = sqlx::query("UPDATE workflows SET status = 'failed', status_reason = ? WHERE id = ?")
let _ = btx.send(WsMessage::WorkflowStatusUpdate { .bind(&reason).bind(&workflow_id).execute(&self.pool).await;
workflow_id, // Log to execution_log so frontend can show the reason
status: "failed".into(), let log_id = uuid::Uuid::new_v4().to_string();
let _ = sqlx::query(
"INSERT INTO execution_log (id, workflow_id, step_order, tool_name, tool_input, output, status, created_at) VALUES (?, ?, 0, 'system', 'dispatch', ?, 'failed', datetime('now'))"
).bind(&log_id).bind(&workflow_id).bind(&reason).execute(&self.pool).await;
let _ = btx.send(WsMessage::StepStatusUpdate {
step_id: log_id, status: "failed".into(), output: reason,
}); });
let _ = btx.send(WsMessage::Error { let _ = btx.send(WsMessage::WorkflowStatusUpdate {
message: format!("No worker available: {}", e), workflow_id, status: "failed".into(),
}); });
} }
} }
@@ -745,6 +750,7 @@ pub async fn run_step_loop(
let _ = update_tx.send(AgentUpdate::WorkflowStatus { let _ = update_tx.send(AgentUpdate::WorkflowStatus {
workflow_id: workflow_id.to_string(), workflow_id: workflow_id.to_string(),
status: "waiting_user".into(), status: "waiting_user".into(),
reason: String::new(),
}).await; }).await;
send_execution(update_tx, workflow_id, step_order, "ask_user", reason, reason, "waiting").await; send_execution(update_tx, workflow_id, step_order, "ask_user", reason, reason, "waiting").await;
@@ -789,6 +795,7 @@ pub async fn run_step_loop(
let _ = update_tx.send(AgentUpdate::WorkflowStatus { let _ = update_tx.send(AgentUpdate::WorkflowStatus {
workflow_id: workflow_id.to_string(), workflow_id: workflow_id.to_string(),
status: "executing".into(), status: "executing".into(),
reason: String::new(),
}).await; }).await;
let tool_msg = if feedback.is_empty() { let tool_msg = if feedback.is_empty() {
@@ -1084,6 +1091,7 @@ pub async fn run_agent_loop(
let _ = update_tx.send(AgentUpdate::WorkflowStatus { let _ = update_tx.send(AgentUpdate::WorkflowStatus {
workflow_id: workflow_id.to_string(), workflow_id: workflow_id.to_string(),
status: "waiting_user".into(), status: "waiting_user".into(),
reason: String::new(),
}).await; }).await;
send_execution(update_tx, workflow_id, 0, "plan_approval", "等待确认计划", "等待用户确认执行计划", "waiting").await; send_execution(update_tx, workflow_id, 0, "plan_approval", "等待确认计划", "等待用户确认执行计划", "waiting").await;
@@ -1114,6 +1122,7 @@ pub async fn run_agent_loop(
let _ = update_tx.send(AgentUpdate::WorkflowStatus { let _ = update_tx.send(AgentUpdate::WorkflowStatus {
workflow_id: workflow_id.to_string(), workflow_id: workflow_id.to_string(),
status: "executing".into(), status: "executing".into(),
reason: String::new(),
}).await; }).await;
// Stay in Planning phase, continue the loop // Stay in Planning phase, continue the loop
continue; continue;
@@ -1130,6 +1139,7 @@ pub async fn run_agent_loop(
let _ = update_tx.send(AgentUpdate::WorkflowStatus { let _ = update_tx.send(AgentUpdate::WorkflowStatus {
workflow_id: workflow_id.to_string(), workflow_id: workflow_id.to_string(),
status: "executing".into(), status: "executing".into(),
reason: String::new(),
}).await; }).await;
} }

View File

@@ -322,6 +322,16 @@ impl Database {
} }
} }
// Migration: add status_reason to workflows
let has_status_reason: bool = sqlx::query_scalar::<_, i32>(
"SELECT COUNT(*) FROM pragma_table_info('workflows') WHERE name='status_reason'"
).fetch_one(&self.pool).await.unwrap_or(0) > 0;
if !has_status_reason {
let _ = sqlx::query(
"ALTER TABLE workflows ADD COLUMN status_reason TEXT NOT NULL DEFAULT ''"
).execute(&self.pool).await;
}
Ok(()) Ok(())
} }
} }
@@ -348,6 +358,8 @@ pub struct Workflow {
pub created_at: String, pub created_at: String,
pub report: String, pub report: String,
pub template_id: String, pub template_id: String,
#[serde(default)]
pub status_reason: String,
} }
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)] #[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]

View File

@@ -14,12 +14,12 @@ use crate::state::{AgentState, Artifact};
#[serde(tag = "kind")] #[serde(tag = "kind")]
pub enum AgentUpdate { pub enum AgentUpdate {
PlanUpdate { workflow_id: String, steps: Vec<PlanStepInfo> }, PlanUpdate { workflow_id: String, steps: Vec<PlanStepInfo> },
WorkflowStatus { workflow_id: String, status: String }, WorkflowStatus { workflow_id: String, status: String, #[serde(default)] reason: String },
Activity { workflow_id: String, activity: String }, Activity { workflow_id: String, activity: String },
ExecutionLog { workflow_id: String, step_order: i32, tool_name: String, tool_input: String, output: String, status: String }, ExecutionLog { workflow_id: String, step_order: i32, tool_name: String, tool_input: String, output: String, status: String },
LlmCallLog { workflow_id: String, step_order: i32, phase: String, messages_count: i32, tools_count: i32, tool_calls: String, text_response: String, prompt_tokens: Option<u32>, completion_tokens: Option<u32>, latency_ms: i64 }, LlmCallLog { workflow_id: String, step_order: i32, phase: String, messages_count: i32, tools_count: i32, tool_calls: String, text_response: String, prompt_tokens: Option<u32>, completion_tokens: Option<u32>, latency_ms: i64 },
StateSnapshot { workflow_id: String, step_order: i32, state: AgentState }, StateSnapshot { workflow_id: String, step_order: i32, state: AgentState },
WorkflowComplete { workflow_id: String, status: String }, WorkflowComplete { workflow_id: String, status: String, #[serde(default)] reason: String },
ArtifactSave { workflow_id: String, step_order: i32, artifact: Artifact }, ArtifactSave { workflow_id: String, step_order: i32, artifact: Artifact },
RequirementUpdate { workflow_id: String, requirement: String }, RequirementUpdate { workflow_id: String, requirement: String },
/// base64-encoded file content /// base64-encoded file content
@@ -61,9 +61,9 @@ pub async fn handle_single_update(
AgentUpdate::PlanUpdate { workflow_id, steps } => { AgentUpdate::PlanUpdate { workflow_id, steps } => {
bcast(broadcast_tx, WsMessage::PlanUpdate { workflow_id: workflow_id.clone(), steps: steps.clone() }); bcast(broadcast_tx, WsMessage::PlanUpdate { workflow_id: workflow_id.clone(), steps: steps.clone() });
} }
AgentUpdate::WorkflowStatus { workflow_id, status } => { AgentUpdate::WorkflowStatus { workflow_id, status, reason } => {
let _ = sqlx::query("UPDATE workflows SET status = ? WHERE id = ?") let _ = sqlx::query("UPDATE workflows SET status = ?, status_reason = ? WHERE id = ?")
.bind(status).bind(workflow_id).execute(pool).await; .bind(status).bind(reason).bind(workflow_id).execute(pool).await;
bcast(broadcast_tx, WsMessage::WorkflowStatusUpdate { workflow_id: workflow_id.clone(), status: status.clone() }); bcast(broadcast_tx, WsMessage::WorkflowStatusUpdate { workflow_id: workflow_id.clone(), status: status.clone() });
} }
AgentUpdate::Activity { workflow_id, activity } => { AgentUpdate::Activity { workflow_id, activity } => {
@@ -100,9 +100,9 @@ pub async fn handle_single_update(
"INSERT INTO agent_state_snapshots (id, workflow_id, step_order, state_json, created_at) VALUES (?, ?, ?, ?, datetime('now'))" "INSERT INTO agent_state_snapshots (id, workflow_id, step_order, state_json, created_at) VALUES (?, ?, ?, ?, datetime('now'))"
).bind(&id).bind(workflow_id).bind(step_order).bind(&json).execute(pool).await; ).bind(&id).bind(workflow_id).bind(step_order).bind(&json).execute(pool).await;
} }
AgentUpdate::WorkflowComplete { workflow_id, status } => { AgentUpdate::WorkflowComplete { workflow_id, status, reason } => {
let _ = sqlx::query("UPDATE workflows SET status = ? WHERE id = ?") let _ = sqlx::query("UPDATE workflows SET status = ?, status_reason = ? WHERE id = ?")
.bind(status).bind(workflow_id).execute(pool).await; .bind(status).bind(reason).bind(workflow_id).execute(pool).await;
bcast(broadcast_tx, WsMessage::WorkflowStatusUpdate { workflow_id: workflow_id.clone(), status: status.clone() }); bcast(broadcast_tx, WsMessage::WorkflowStatusUpdate { workflow_id: workflow_id.clone(), status: status.clone() });
} }
AgentUpdate::ArtifactSave { workflow_id, step_order, artifact } => { AgentUpdate::ArtifactSave { workflow_id, step_order, artifact } => {

View File

@@ -163,4 +163,13 @@ impl WorkerManager {
pub async fn complete_workflow(&self, workflow_id: &str) { pub async fn complete_workflow(&self, workflow_id: &str) {
self.assignments.write().await.remove(workflow_id); self.assignments.write().await.remove(workflow_id);
} }
/// List all workflows assigned to a worker.
pub async fn assignments_for_worker(&self, worker_name: &str) -> Vec<String> {
self.assignments.read().await
.iter()
.filter(|(_, w)| w.as_str() == worker_name)
.map(|(wf_id, _)| wf_id.clone())
.collect()
}
} }

View File

@@ -194,9 +194,11 @@ async fn connect_and_run(server_url: &str, worker_name: &str, llm_config: &crate
// Sync all workspace files to server // Sync all workspace files to server
sync_workspace(&update_tx, &project_id, &workdir).await; sync_workspace(&update_tx, &project_id, &workdir).await;
let reason = if let Err(ref e) = result { format!("{}", e) } else { String::new() };
let _ = update_tx.send(AgentUpdate::WorkflowComplete { let _ = update_tx.send(AgentUpdate::WorkflowComplete {
workflow_id: workflow_id.clone(), workflow_id: workflow_id.clone(),
status: final_status.into(), status: final_status.into(),
reason,
}).await; }).await;
*comment_tx.lock().await = None; *comment_tx.lock().await = None;

View File

@@ -111,6 +111,23 @@ async fn handle_worker_socket(socket: WebSocket, state: Arc<WsWorkerState>) {
_ = recv_task => {}, _ = recv_task => {},
} }
// Log reason for any orphaned workflows before cleanup
let orphan_workflows: Vec<String> = {
let assignments = mgr_for_cleanup.assignments_for_worker(&name_clone).await;
assignments
};
if !orphan_workflows.is_empty() {
let reason = format!("Worker '{}' 断开连接", name_clone);
for wf_id in &orphan_workflows {
let _ = sqlx::query("UPDATE workflows SET status = 'failed', status_reason = ? WHERE id = ? AND status IN ('executing', 'planning')")
.bind(&reason).bind(wf_id).execute(&state.pool).await;
let log_id = uuid::Uuid::new_v4().to_string();
let _ = sqlx::query(
"INSERT INTO execution_log (id, workflow_id, step_order, tool_name, tool_input, output, status, created_at) VALUES (?, ?, 0, 'system', 'worker_disconnect', ?, 'failed', datetime('now'))"
).bind(&log_id).bind(wf_id).bind(&reason).execute(&state.pool).await;
tracing::warn!("Workflow {} orphaned: {}", wf_id, reason);
}
}
mgr_for_cleanup.unregister(&name_clone).await; mgr_for_cleanup.unregister(&name_clone).await;
} }