refactor(worker/state): introduce TaskStateService and route mutations through it

Slice 2 of the worker state consolidation refactor (spec sections 2 and 8).

Adds Worker/State/ITaskStateService + TaskStateService as the single component
that mutates Status, PlanningPhase, and BlockedByTaskId. Each transition is one
atomic ExecuteUpdate with a WHERE filter on the expected source status, so
parallel claims are TOCTOU-free. Side effects (queue wake on any transition to
Queued, hub TaskUpdated broadcast, chain advance and parent completion when a
child reaches a terminal status) are owned by the service, so callers no longer
need to remember them.

Migrated callers (mechanical, behavior preserved):
- TaskRunner: HandleSuccess/HandleFailure/MarkFailed/RunAsync/ContinueAsync
- StaleTaskRecovery: bulk recover stale Running tasks
- TaskResetService: status flip (worktree cleanup stays in service)
- PlanningSessionManager.StartAsync: status flip via state, token write via repo
- PlanningChainCoordinator.OnChildFinishedAsync: routes the next-sibling write
  through state.UnblockAsync (Slice 4 finishes the rewrite)
- ExternalMcpService.UpdateTaskStatus: Queued case via state.EnqueueAsync

Repo Mark*Async helpers (MarkRunning/MarkDone/MarkFailed/FlipAllRunningToFailed)
are now internal; ClaudeDo.Data grants InternalsVisibleTo to ClaudeDo.Worker
and ClaudeDo.Worker.Tests for the existing repo-level tests.

DI: TaskStateService is registered as Singleton in both the main app and the
external-MCP app; the queue-wake delegate captures sp -> QueueService.WakeQueue
to break the TaskStateService -> QueueService -> TaskRunner -> TaskStateService
construction cycle. PlanningChainCoordinator takes Func<ITaskStateService> for
the same reason; Slice 3 will replace both with IQueueWaker.

Tests: TaskStateServiceTests covers the happy path and the rejection path for
every transition, the parallel StartRunningAsync claim race, chain advancement
when a child reaches a terminal status, and stale recovery. Existing
service/repo tests are updated to construct the new state service via a
TaskStateServiceBuilder helper. Pre-existing constructor drift in the
QueueService/ExternalMcp/PlanningHub tests is patched to keep the test project
building (the surrounding test logic is otherwise untouched).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mika Kuns
2026-04-27 11:31:57 +02:00
parent cf7a6e413c
commit 8823265e5a
22 changed files with 845 additions and 91 deletions

View File

@@ -3,7 +3,7 @@ using ClaudeDo.Data.Models;
using ClaudeDo.Data.Repositories;
using ClaudeDo.Worker.Config;
using ClaudeDo.Worker.Hub;
using ClaudeDo.Worker.Planning;
using ClaudeDo.Worker.State;
using Microsoft.EntityFrameworkCore;
using TaskStatus = ClaudeDo.Data.Models.TaskStatus;
@@ -18,7 +18,7 @@ public sealed class TaskRunner
private readonly ClaudeArgsBuilder _argsBuilder;
private readonly WorkerConfig _cfg;
private readonly ILogger<TaskRunner> _logger;
private readonly PlanningChainCoordinator _chain;
private readonly ITaskStateService _state;
public TaskRunner(
IClaudeProcess claude,
@@ -28,7 +28,7 @@ public sealed class TaskRunner
ClaudeArgsBuilder argsBuilder,
WorkerConfig cfg,
ILogger<TaskRunner> logger,
PlanningChainCoordinator chain)
ITaskStateService state)
{
_claude = claude;
_dbFactory = dbFactory;
@@ -37,7 +37,7 @@ public sealed class TaskRunner
_argsBuilder = argsBuilder;
_cfg = cfg;
_logger = logger;
_chain = chain;
_state = state;
}
public async Task RunAsync(TaskEntity task, string slot, CancellationToken ct)
@@ -91,11 +91,7 @@ public sealed class TaskRunner
var resolvedConfig = await ResolveConfigAsync(task, listConfig, null, ct);
var now = DateTime.UtcNow;
using (var context = _dbFactory.CreateDbContext())
{
var taskRepo = new TaskRepository(context);
await taskRepo.MarkRunningAsync(task.Id, now, ct);
}
await _state.StartRunningAsync(task.Id, now, ct);
await _broadcaster.TaskStarted(slot, task.Id, now);
// Build prompt.
@@ -202,11 +198,7 @@ public sealed class TaskRunner
}
var now = DateTime.UtcNow;
using (var context = _dbFactory.CreateDbContext())
{
var taskRepo = new TaskRepository(context);
await taskRepo.MarkRunningAsync(taskId, now, ct);
}
await _state.StartRunningAsync(taskId, now, ct);
await _broadcaster.TaskStarted(slot, taskId, now);
var nextRunNumber = lastRun.RunNumber + 1;
@@ -332,34 +324,11 @@ public sealed class TaskRunner
// is never left as 'running' because of a cancel that arrived
// after the Claude run already succeeded.
var finishedAt = DateTime.UtcNow;
using (var context = _dbFactory.CreateDbContext())
{
var taskRepo = new TaskRepository(context);
await taskRepo.MarkDoneAsync(task.Id, finishedAt, result.ResultMarkdown, CancellationToken.None);
if (task.ParentTaskId is not null)
await taskRepo.TryCompleteParentAsync(task.ParentTaskId, CancellationToken.None);
}
await _state.CompleteAsync(task.Id, finishedAt, result.ResultMarkdown, CancellationToken.None);
await _broadcaster.WorkerLog($"Finished \"{task.Title}\" (done)", WorkerLogLevel.Success, DateTime.UtcNow);
await _broadcaster.TaskFinished(slot, task.Id, "done", finishedAt);
_logger.LogInformation("Task {TaskId} completed (turns={Turns}, tokens_in={In}, tokens_out={Out})",
task.Id, result.TurnCount, result.TokensIn, result.TokensOut);
// Sequential planning chain: if this task has a parent, flip the next
// Waiting sibling to Queued so the queue pickup loop dispatches it next.
if (task.ParentTaskId is not null)
{
try
{
var advanced = await _chain.OnChildFinishedAsync(
task.Id, TaskStatus.Done, CancellationToken.None);
if (advanced is not null)
await _broadcaster.TaskUpdated(advanced);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "PlanningChain advance failed for {TaskId}", task.Id);
}
}
}
private async Task HandleFailure(string taskId, string taskTitle, string slot, RunResult result)
@@ -367,12 +336,7 @@ public sealed class TaskRunner
// Intentionally does not accept a CancellationToken: this is the
// terminal write for a failed task and must always be persisted.
var finishedAt = DateTime.UtcNow;
using var context = _dbFactory.CreateDbContext();
var taskRepo = new TaskRepository(context);
await taskRepo.MarkFailedAsync(taskId, finishedAt, result.ErrorMarkdown, CancellationToken.None);
var justFailed = await taskRepo.GetByIdAsync(taskId, CancellationToken.None);
if (justFailed?.ParentTaskId is not null)
await taskRepo.TryCompleteParentAsync(justFailed.ParentTaskId, CancellationToken.None);
await _state.FailAsync(taskId, finishedAt, result.ErrorMarkdown, CancellationToken.None);
await _broadcaster.WorkerLog($"Finished \"{taskTitle}\" (failed)", WorkerLogLevel.Error, DateTime.UtcNow);
await _broadcaster.TaskFinished(slot, taskId, "failed", finishedAt);
_logger.LogWarning("Task {TaskId} failed (turns={Turns}): {Error}", taskId, result.TurnCount, result.ErrorMarkdown);
@@ -384,15 +348,9 @@ public sealed class TaskRunner
{
var now = DateTime.UtcNow;
// Terminal write — never cancel.
using var context = _dbFactory.CreateDbContext();
var taskRepo = new TaskRepository(context);
await taskRepo.MarkFailedAsync(taskId, now, error, CancellationToken.None);
var justFailed = await taskRepo.GetByIdAsync(taskId, CancellationToken.None);
if (justFailed?.ParentTaskId is not null)
await taskRepo.TryCompleteParentAsync(justFailed.ParentTaskId, CancellationToken.None);
await _state.FailAsync(taskId, now, error, CancellationToken.None);
await _broadcaster.WorkerLog($"Finished \"{taskTitle}\" (failed)", WorkerLogLevel.Error, DateTime.UtcNow);
await _broadcaster.TaskFinished(slot, taskId, "failed", now);
await _broadcaster.TaskUpdated(taskId);
}
catch (Exception ex)
{