fix: robust worker interaction

- Send a text heartbeat every 30s (instead of ping frames) so the connection survives Cloudflare and other proxies
- A comment on a completed/failed workflow re-dispatches the workflow to a worker, resuming from its latest saved state
- Dispatch is attempted up to 3 times in total, with a 5s delay between attempts (survives the worker reconnect window)
This commit is contained in:
2026-04-06 21:50:56 +01:00
parent c6c03224b1
commit 409bc89284
3 changed files with 83 additions and 11 deletions

View File

@@ -153,7 +153,16 @@ impl AgentManager {
require_plan_approval: false,
};
match self.worker_mgr.assign_workflow(assign).await {
// Retry dispatch up to 3 times (worker might be reconnecting)
let mut dispatch_result = self.worker_mgr.assign_workflow(assign.clone()).await;
for attempt in 1..3 {
if dispatch_result.is_ok() { break; }
tracing::warn!("Dispatch attempt {} failed, retrying in 5s...", attempt);
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
dispatch_result = self.worker_mgr.assign_workflow(assign.clone()).await;
}
match dispatch_result {
Ok(name) => {
tracing::info!("Workflow {} dispatched to worker '{}'", workflow_id, name);
}
@@ -177,8 +186,71 @@ impl AgentManager {
}
}
AgentEvent::Comment { workflow_id, content } => {
if let Err(e) = self.worker_mgr.forward_comment(&workflow_id, &content).await {
tracing::warn!("Failed to forward comment for workflow {}: {}", workflow_id, e);
// Try to forward to running worker first
if let Err(_) = self.worker_mgr.forward_comment(&workflow_id, &content).await {
// No worker handling this workflow — re-dispatch it
// Load workflow to get project_id and requirement
let wf = sqlx::query_as::<_, crate::db::Workflow>(
"SELECT * FROM workflows WHERE id = ?"
).bind(&workflow_id).fetch_optional(&self.pool).await.ok().flatten();
if let Some(wf) = wf {
tracing::info!("Re-dispatching workflow {} with comment", workflow_id);
// Load latest state for resume
let state_json: Option<String> = sqlx::query_scalar(
"SELECT state_json FROM agent_state_snapshots WHERE workflow_id = ? ORDER BY created_at DESC LIMIT 1"
).bind(&workflow_id).fetch_optional(&self.pool).await.ok().flatten();
let mut initial_state = state_json
.and_then(|json| serde_json::from_str::<crate::state::AgentState>(&json).ok());
// Attach comment as user feedback + reset failed/waiting steps
if let Some(ref mut state) = initial_state {
for step in &mut state.steps {
if matches!(step.status, crate::state::StepStatus::Failed) {
step.status = crate::state::StepStatus::Pending;
}
if matches!(step.status, crate::state::StepStatus::WaitingUser) {
step.status = crate::state::StepStatus::Running;
}
}
if let Some(order) = state.first_actionable_step() {
if let Some(step) = state.steps.iter_mut().find(|s| s.order == order) {
step.user_feedbacks.push(content);
}
}
}
let assign = crate::worker::ServerToWorker::WorkflowAssign {
workflow_id: workflow_id.clone(),
project_id: wf.project_id.clone(),
requirement: wf.requirement,
template_id: if wf.template_id.is_empty() { None } else { Some(wf.template_id) },
initial_state,
require_plan_approval: false,
};
let btx = {
let mut map = self.broadcast.write().await;
map.entry(wf.project_id.clone())
.or_insert_with(|| broadcast::channel(64).0)
.clone()
};
match self.worker_mgr.assign_workflow(assign).await {
Ok(name) => {
let _ = sqlx::query("UPDATE workflows SET status = 'executing', status_reason = '' WHERE id = ?")
.bind(&workflow_id).execute(&self.pool).await;
let _ = btx.send(WsMessage::WorkflowStatusUpdate {
workflow_id, status: "executing".into(),
});
tracing::info!("Workflow re-dispatched to worker '{}'", name);
}
Err(e) => {
tracing::error!("Failed to re-dispatch workflow {}: {}", workflow_id, e);
}
}
}
}
}
}