fix(worker): address concurrency, cancellation, and resource issues

- claude process: run stdout/stderr reads without ct; rely on
  kill-on-cancel closing the pipes to unblock them — previously
  ReadLineAsync(ct) could hang, stalling task slots and shutdown
- task runner: terminal db writes (task_runs, MarkDone, MarkFailed,
  SetLogPath) now use CancellationToken.None; RunOnceAsync catches
  OCE and finalizes the run row so ContinueAsync can resume
- task repository: GetNextQueuedAgentTaskAsync is now a single
  UPDATE ... RETURNING statement — closes TOCTOU window where two
  loop iterations could dispatch the same queued task
- queue service: dispose CancellationTokenSource in slot-completion
  ContinueWith to stop leaking wait handles
- git service: register ct.Kill(processTree), drain reads without ct,
  always reap via WaitForExitAsync(None) — no more git zombies on
  cancelled worktree ops
- worktree manager: branch name uses full task id (dashes stripped)
  instead of 8-char prefix, eliminating collision risk

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mika Kuns
2026-04-15 16:27:18 +02:00
parent fc9029de97
commit d3b85f2234
7 changed files with 122 additions and 57 deletions

View File

@@ -104,20 +104,34 @@ public sealed class GitService
using var proc = new Process { StartInfo = psi };
proc.Start();
// On cancellation: kill the git process tree. Killing closes the
// redirected pipes, which unblocks the ReadToEndAsync calls below
// and lets WaitForExitAsync return so the process is reaped.
// Without this, cancelling mid-git leaves zombie processes.
await using var ctr = ct.Register(() =>
{
try { proc.Kill(entireProcessTree: true); }
catch { /* already exited */ }
});
if (stdinData is not null)
{
await proc.StandardInput.WriteAsync(stdinData.AsMemory(), ct);
proc.StandardInput.Close();
}
var stdoutTask = proc.StandardOutput.ReadToEndAsync(ct);
var stderrTask = proc.StandardError.ReadToEndAsync(ct);
// Drain output without ct — pipes close when the process exits
// (whether naturally or via Kill above), so these always complete.
var stdoutTask = proc.StandardOutput.ReadToEndAsync();
var stderrTask = proc.StandardError.ReadToEndAsync();
await proc.WaitForExitAsync(ct);
await proc.WaitForExitAsync(CancellationToken.None);
var stdout = await stdoutTask;
var stderr = await stderrTask;
ct.ThrowIfCancellationRequested();
return (proc.ExitCode, stdout.TrimEnd(), stderr.TrimEnd());
}
}

View File

@@ -174,26 +174,36 @@ public sealed class TaskRepository
public async Task<TaskEntity?> GetNextQueuedAgentTaskAsync(DateTime now, CancellationToken ct = default)
{
// Atomically claim the next queued agent task: the UPDATE flips its
// status to 'running' in the same statement that returns its row,
// eliminating the TOCTOU gap where two queue-loop iterations could
// both select the same queued task before either marked it running.
// The caller is responsible for populating started_at shortly after.
await using var conn = _factory.Open();
await using var cmd = conn.CreateCommand();
cmd.CommandText = """
SELECT t.id, t.list_id, t.title, t.description, t.status, t.scheduled_for,
t.result, t.log_path, t.created_at, t.started_at, t.finished_at, t.commit_type,
t.model, t.system_prompt, t.agent_path
FROM tasks t
WHERE t.status = 'queued'
AND (t.scheduled_for IS NULL OR t.scheduled_for <= @now)
AND EXISTS (
SELECT 1 FROM task_tags tt
JOIN tags tg ON tg.id = tt.tag_id
WHERE tt.task_id = t.id AND tg.name = 'agent'
UNION
SELECT 1 FROM list_tags lt
JOIN tags tg ON tg.id = lt.tag_id
WHERE lt.list_id = t.list_id AND tg.name = 'agent'
)
ORDER BY t.created_at ASC
LIMIT 1
UPDATE tasks
SET status = 'running'
WHERE id = (
SELECT t.id
FROM tasks t
WHERE t.status = 'queued'
AND (t.scheduled_for IS NULL OR t.scheduled_for <= @now)
AND EXISTS (
SELECT 1 FROM task_tags tt
JOIN tags tg ON tg.id = tt.tag_id
WHERE tt.task_id = t.id AND tg.name = 'agent'
UNION
SELECT 1 FROM list_tags lt
JOIN tags tg ON tg.id = lt.tag_id
WHERE lt.list_id = t.list_id AND tg.name = 'agent'
)
ORDER BY t.created_at ASC
LIMIT 1
)
RETURNING id, list_id, title, description, status, scheduled_for,
result, log_path, created_at, started_at, finished_at, commit_type,
model, system_prompt, agent_path
""";
cmd.Parameters.AddWithValue("@now", now.ToString("o"));