fix(worker): address concurrency, cancellation, and resource issues
- claude process: run stdout/stderr reads without ct; rely on kill-on-cancel closing the pipes to unblock them — previously ReadLineAsync(ct) could hang, stalling task slots and shutdown
- task runner: terminal db writes (task_runs, MarkDone, MarkFailed, SetLogPath) now use CancellationToken.None; RunOnceAsync catches OCE and finalizes the run row so ContinueAsync can resume
- task repository: GetNextQueuedAgentTaskAsync is now a single UPDATE ... RETURNING statement — closes TOCTOU window where two loop iterations could dispatch the same queued task
- queue service: dispose CancellationTokenSource in slot-completion ContinueWith to stop leaking wait handles
- git service: register ct.Kill(processTree), drain reads without ct, always reap via WaitForExitAsync(None) — no more git zombies on cancelled worktree ops
- worktree manager: branch name uses full task id (dashes stripped) instead of 8-char prefix, eliminating collision risk

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -232,33 +232,56 @@ public sealed class TaskRunner
|
||||
|
||||
await using var logWriter = new LogWriter(logPath);
|
||||
|
||||
var result = await _claude.RunAsync(
|
||||
arguments,
|
||||
prompt,
|
||||
runDir,
|
||||
async line =>
|
||||
try
|
||||
{
|
||||
var result = await _claude.RunAsync(
|
||||
arguments,
|
||||
prompt,
|
||||
runDir,
|
||||
async line =>
|
||||
{
|
||||
await logWriter.WriteLineAsync(line, ct);
|
||||
await _broadcaster.TaskMessage(taskId, line);
|
||||
},
|
||||
ct);
|
||||
|
||||
// Update the run record with results. Use CancellationToken.None:
|
||||
// this is a terminal write that must always complete, even if the
|
||||
// caller's token is already cancelled.
|
||||
run.SessionId = result.SessionId;
|
||||
run.ResultMarkdown = result.ResultMarkdown;
|
||||
run.StructuredOutputJson = result.StructuredOutputJson;
|
||||
run.ErrorMarkdown = result.ErrorMarkdown;
|
||||
run.ExitCode = result.ExitCode;
|
||||
run.TurnCount = result.TurnCount;
|
||||
run.TokensIn = result.TokensIn;
|
||||
run.TokensOut = result.TokensOut;
|
||||
run.FinishedAt = DateTime.UtcNow;
|
||||
await _runRepo.UpdateAsync(run, CancellationToken.None);
|
||||
|
||||
// Update denormalized fields on the task.
|
||||
await _taskRepo.SetLogPathAsync(taskId, logPath, CancellationToken.None);
|
||||
|
||||
return result;
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Ensure the run row is completed so ContinueAsync / inspection
|
||||
// isn't left staring at a null session_id / finished_at.
|
||||
run.ErrorMarkdown = "Cancelled.";
|
||||
run.ExitCode = -1;
|
||||
run.FinishedAt = DateTime.UtcNow;
|
||||
try
|
||||
{
|
||||
await logWriter.WriteLineAsync(line, ct);
|
||||
await _broadcaster.TaskMessage(taskId, line);
|
||||
},
|
||||
ct);
|
||||
|
||||
// Update the run record with results.
|
||||
run.SessionId = result.SessionId;
|
||||
run.ResultMarkdown = result.ResultMarkdown;
|
||||
run.StructuredOutputJson = result.StructuredOutputJson;
|
||||
run.ErrorMarkdown = result.ErrorMarkdown;
|
||||
run.ExitCode = result.ExitCode;
|
||||
run.TurnCount = result.TurnCount;
|
||||
run.TokensIn = result.TokensIn;
|
||||
run.TokensOut = result.TokensOut;
|
||||
run.FinishedAt = DateTime.UtcNow;
|
||||
await _runRepo.UpdateAsync(run, ct);
|
||||
|
||||
// Update denormalized fields on the task.
|
||||
await _taskRepo.SetLogPathAsync(taskId, logPath, ct);
|
||||
|
||||
return result;
|
||||
await _runRepo.UpdateAsync(run, CancellationToken.None);
|
||||
await _taskRepo.SetLogPathAsync(taskId, logPath, CancellationToken.None);
|
||||
}
|
||||
catch (Exception updateEx)
|
||||
{
|
||||
_logger.LogError(updateEx, "Failed to finalize cancelled run {RunId} for task {TaskId}", runId, taskId);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
private async Task HandleSuccess(TaskEntity task, ListEntity list, string slot, WorktreeContext? wtCtx, RunResult result, CancellationToken ct)
|
||||
@@ -270,8 +293,11 @@ public sealed class TaskRunner
|
||||
await _broadcaster.WorktreeUpdated(task.Id);
|
||||
}
|
||||
|
||||
// Terminal DB write uses CancellationToken.None so the task status
|
||||
// is never left as 'running' because of a cancel that arrived
|
||||
// after the Claude run already succeeded.
|
||||
var finishedAt = DateTime.UtcNow;
|
||||
await _taskRepo.MarkDoneAsync(task.Id, finishedAt, result.ResultMarkdown, ct);
|
||||
await _taskRepo.MarkDoneAsync(task.Id, finishedAt, result.ResultMarkdown, CancellationToken.None);
|
||||
await _broadcaster.TaskFinished(slot, task.Id, "done", finishedAt);
|
||||
_logger.LogInformation("Task {TaskId} completed (turns={Turns}, tokens_in={In}, tokens_out={Out})",
|
||||
task.Id, result.TurnCount, result.TokensIn, result.TokensOut);
|
||||
@@ -279,8 +305,10 @@ public sealed class TaskRunner
|
||||
|
||||
private async Task HandleFailure(string taskId, string slot, RunResult result)
|
||||
{
|
||||
// Intentionally does not accept a CancellationToken: this is the
|
||||
// terminal write for a failed task and must always be persisted.
|
||||
var finishedAt = DateTime.UtcNow;
|
||||
await _taskRepo.MarkFailedAsync(taskId, finishedAt, result.ErrorMarkdown);
|
||||
await _taskRepo.MarkFailedAsync(taskId, finishedAt, result.ErrorMarkdown, CancellationToken.None);
|
||||
await _broadcaster.TaskFinished(slot, taskId, "failed", finishedAt);
|
||||
_logger.LogWarning("Task {TaskId} failed (turns={Turns}): {Error}", taskId, result.TurnCount, result.ErrorMarkdown);
|
||||
}
|
||||
@@ -290,7 +318,8 @@ public sealed class TaskRunner
|
||||
try
|
||||
{
|
||||
var now = DateTime.UtcNow;
|
||||
await _taskRepo.MarkFailedAsync(taskId, now, error);
|
||||
// Terminal write — never cancel.
|
||||
await _taskRepo.MarkFailedAsync(taskId, now, error, CancellationToken.None);
|
||||
await _broadcaster.TaskFinished(slot, taskId, "failed", now);
|
||||
await _broadcaster.TaskUpdated(taskId);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user