From 71ac48162a919915f86364529a40e5f1571c9e8c Mon Sep 17 00:00:00 2001 From: mika kuns Date: Thu, 4 Jun 2026 11:21:40 +0200 Subject: [PATCH] fix(worker): clean up orphaned worktree when the DB row insert fails If WorktreeAddAsync succeeds but the worktrees-row insert throws, the worktree was left on disk and branch undeleted with nothing tracking it. Wrap the insert in try/catch and best-effort remove the worktree+branch (non-cancellable) before rethrowing. Co-Authored-By: Claude Opus 4.7 --- docs/open.md | 13 +++--- src/ClaudeDo.Worker/Runner/WorktreeManager.cs | 44 +++++++++++++------ .../Runner/WorktreeManagerTests.cs | 31 +++++++++++++ 3 files changed, 67 insertions(+), 21 deletions(-) diff --git a/docs/open.md b/docs/open.md index 6062d01..0392c5c 100644 --- a/docs/open.md +++ b/docs/open.md @@ -31,14 +31,14 @@ Weitere Specs/Plans seit 2026-04-30 (für die Historie): `planning-draft-planned Der Code ist da; was fehlt, ist die **manuelle Bestätigung mit explizit notiertem Pass-Kriterium**. Ohne falsifizierbare Observable produziert ein Manual-Run nur „sah ok aus". -### 1.0 Plan-Verification — Kernpipeline (unverändert dringend) +### 1.0 Plan-Verification — Kernpipeline -Die End-to-End-Beweise für die eigentliche Ausführungspipeline wurden **nie durchgespielt**. Das ist das größte verbleibende Risiko: alle neuen Features sitzen auf einer Pipeline, deren Happy-Path nie unter echtem `claude` verifiziert wurde. +**2026-06-04: Kernpipeline läuft.** Der User bestätigt, dass Tasks mit echtem `claude` gestartet werden und der Happy-Path (non-worktree) durchläuft — Live-Stream und Wake-up inklusive. Die granularen Observables unten wurden nicht formell protokolliert; offen bleiben die worktree-spezifischen Pfade (5–7). | # | Item | Status | Pass-Kriterium | |---|------|---|---| -| 3 | Smoke-Spawn (`claude -p` „ping") | ⬜ | `task_runs`-Row mit `session_id NOT NULL`, `result` non-empty, `output_tokens > 0` | -| 4 | E2E Happy Path (Non-Worktree) | ⬜ | Liste anlegen → Task → `tasks.status='Done'`, `result IS NOT NULL`, Logfile unter `~/.todo-app/logs/.ndjson`, UI-Row mit Done-Badge | +| 3 | Smoke-Spawn (`claude -p` „ping") | ✅ | (in der Praxis bestätigt 2026-06-04; formale Observables nicht protokolliert) | +| 4 | E2E Happy Path (Non-Worktree) | ✅ | (in der Praxis bestätigt 2026-06-04) | | 5 | Worktree Happy Path | ⬜ | Liste mit `working_dir` auf temp-Repo, Task mit Codeänderung → `worktrees.state='active'`, `head_commit IS NOT NULL`, `diff_stat` non-empty, Branch `claudedo/` auf Disk | | 6 | No-Changes-Run | ⬜ | Prompt der nichts ändert → `status='Done'`, `worktrees.head_commit IS NULL`, `diff_stat IS NULL` | | 7 | Kein Git-Repo | ⬜ | `working_dir=C:\Temp` → `status='Failed'`, **keine** `worktrees`-Row, Git-Fehler im Log | @@ -111,10 +111,9 @@ Voraussetzung: Gitea-Release unter `git.kuns.dev/releases/ClaudeDo` mit `ClaudeD ## 3. Worker-Robustheit -### 3.1 Worktree-Cleanup bei Anlage-Failed 🟡 +### 3.1 Worktree-Cleanup bei Anlage-Failed ✅ - `WorktreeManager.cs:64-92` heilt eine „branch already exists"-Kollision (remove + prune + delete + retry). -- **Offen:** Es gibt **kein** `try/finally`, das `git worktree remove --force` ausführt, wenn `WorktreeAddAsync` erfolgreich war, aber der anschließende DB-Insert (`:95-109`) wirft. Genau dieser Crash-Pfad ist weiterhin ungeschützt. -- **Aufwand:** klein. +- Der DB-Insert (`:96-122`) ist jetzt in `try/catch` gekapselt: schlägt er fehl, nachdem `WorktreeAddAsync` erfolgreich war, wird der verwaiste Worktree + Branch best-effort (non-cancellable) entfernt und die Exception rethrown. Test: `WorktreeManagerTests.CreateAsync_DbInsertFails_RemovesOrphanedWorktreeAndBranch`. ### 3.2 CLI-Preflight beim Worker-Start ✅ - `Worker/Lifecycle/ClaudeCliPreflight.cs` + Wiring in `Program.cs`; skippable via `CLAUDEDO_SKIP_CLI_PREFLIGHT=1`; Tests vorhanden. diff --git a/src/ClaudeDo.Worker/Runner/WorktreeManager.cs b/src/ClaudeDo.Worker/Runner/WorktreeManager.cs index e0c9561..9f1fa1c 100644 --- a/src/ClaudeDo.Worker/Runner/WorktreeManager.cs +++ b/src/ClaudeDo.Worker/Runner/WorktreeManager.cs @@ -92,21 +92,37 @@ public sealed class WorktreeManager } // Insert worktrees row AFTER git succeeds — if git throws, no row is created. - using var context = _dbFactory.CreateDbContext(); - var wtRepo = new WorktreeRepository(context); - // Drop any stale row from a prior run (force-remove may have left the DB side behind). - await wtRepo.DeleteAsync(task.Id, ct); - await wtRepo.AddAsync(new WorktreeEntity + // If the insert itself fails, the worktree is already on disk with nothing + // tracking it: remove it (best-effort, non-cancellable) before rethrowing. + try { - TaskId = task.Id, - Path = worktreePath, - BranchName = branchName, - BaseCommit = baseCommit, - HeadCommit = null, - DiffStat = null, - State = WorktreeState.Active, - CreatedAt = DateTime.UtcNow, - }, ct); + using var context = _dbFactory.CreateDbContext(); + var wtRepo = new WorktreeRepository(context); + // Drop any stale row from a prior run (force-remove may have left the DB side behind). + await wtRepo.DeleteAsync(task.Id, ct); + await wtRepo.AddAsync(new WorktreeEntity + { + TaskId = task.Id, + Path = worktreePath, + BranchName = branchName, + BaseCommit = baseCommit, + HeadCommit = null, + DiffStat = null, + State = WorktreeState.Active, + CreatedAt = DateTime.UtcNow, + }, ct); + } + catch (Exception dbEx) + { + _logger.LogError(dbEx, + "Failed to record worktree row for task {TaskId}; removing orphaned worktree at {Path}", + task.Id, worktreePath); + try { await _git.WorktreeRemoveAsync(workingDir, worktreePath, force: true, CancellationToken.None); } + catch (Exception rmEx) { _logger.LogWarning(rmEx, "Failed to remove orphaned worktree at {Path}", worktreePath); } + try { await _git.BranchDeleteAsync(workingDir, branchName, force: true, CancellationToken.None); } + catch (Exception delEx) { _logger.LogWarning(delEx, "Failed to delete orphaned branch {Branch}", branchName); } + throw; + } _logger.LogInformation("Created worktree for task {TaskId} at {Path} (branch {Branch}, base {Base})", task.Id, worktreePath, branchName, baseCommit); diff --git a/tests/ClaudeDo.Worker.Tests/Runner/WorktreeManagerTests.cs b/tests/ClaudeDo.Worker.Tests/Runner/WorktreeManagerTests.cs index efc250d..ff72d81 100644 --- a/tests/ClaudeDo.Worker.Tests/Runner/WorktreeManagerTests.cs +++ b/tests/ClaudeDo.Worker.Tests/Runner/WorktreeManagerTests.cs @@ -158,6 +158,37 @@ public class WorktreeManagerTests : IDisposable Assert.Null(row); } + [Fact] + public async Task CreateAsync_DbInsertFails_RemovesOrphanedWorktreeAndBranch() + { + if (!GitAvailable) { Assert.True(true, "git not available -- skipping"); return; } + + var repo = CreateRepo(); + var (task, list) = MakeEntities(repo.RepoDir); + + // Seed the list but NOT the task: the worktrees-row insert references + // tasks(task_id) and fails the FK after `git worktree add` has succeeded. + var db = new DbFixture(); + _dbFixtures.Add(db); + using (var seedCtx = db.CreateContext()) + await new ListRepository(seedCtx).AddAsync(list); + + var cfg = new WorkerConfig { WorktreeRootStrategy = "sibling" }; + var mgr = new WorktreeManager( + new GitService(), db.CreateFactory(), cfg, NullLogger.Instance); + + await Assert.ThrowsAnyAsync( + () => mgr.CreateAsync(task, list, CancellationToken.None)); + + var branchName = $"claudedo/{task.Id.Replace("-", "")}"; + var branchList = GitRepoFixture.RunGit(repo.RepoDir, "branch", "--list", branchName); + Assert.True(string.IsNullOrWhiteSpace(branchList), + $"orphaned branch {branchName} should be cleaned up, got: {branchList}"); + + var worktreeList = GitRepoFixture.RunGit(repo.RepoDir, "worktree", "list"); + Assert.DoesNotContain(task.Id, worktreeList); + } + private static (TaskEntity task, ListEntity list) MakeEntities(string workingDir) { var listId = Guid.NewGuid().ToString();