From faf6104645862f90fcff8a0254211c5426c8ded9 Mon Sep 17 00:00:00 2001 From: Mika Kuns Date: Fri, 26 Jun 2026 14:34:31 +0200 Subject: [PATCH] fix(worker): kill spawned claude trees when the worker dies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spawned claude processes were only torn down on graceful cancellation (Process.Kill(entireProcessTree)). A hard worker death — Task Manager End Task, crash, OS restart, installer update — ran no cleanup, orphaning the claude->node->conhost tree, which lingered and piled up across restarts. Assign every spawned claude process to a Windows Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. The worker holds the only handle, so when it terminates for any reason the OS tears down the whole job. No-op on non-Windows; best-effort with a one-time warning if the Win32 calls fail. --- src/ClaudeDo.Worker/Runner/ClaudeProcess.cs | 1 + .../Runner/ProcessClaudeStreamTransport.cs | 1 + .../Runner/ProcessJobObject.cs | 157 ++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 src/ClaudeDo.Worker/Runner/ProcessJobObject.cs diff --git a/src/ClaudeDo.Worker/Runner/ClaudeProcess.cs b/src/ClaudeDo.Worker/Runner/ClaudeProcess.cs index 9f03d86..76bf979 100644 --- a/src/ClaudeDo.Worker/Runner/ClaudeProcess.cs +++ b/src/ClaudeDo.Worker/Runner/ClaudeProcess.cs @@ -46,6 +46,7 @@ public sealed class ClaudeProcess : IClaudeProcess using var process = new Process { StartInfo = psi }; process.Start(); + ProcessJobObject.Assign(process, _logger); await process.StandardInput.WriteAsync(prompt); process.StandardInput.Close(); diff --git a/src/ClaudeDo.Worker/Runner/ProcessClaudeStreamTransport.cs b/src/ClaudeDo.Worker/Runner/ProcessClaudeStreamTransport.cs index ae888ee..10f5717 100644 --- a/src/ClaudeDo.Worker/Runner/ProcessClaudeStreamTransport.cs +++ b/src/ClaudeDo.Worker/Runner/ProcessClaudeStreamTransport.cs @@ -45,6 +45,7 @@ public sealed class ProcessClaudeStreamTransport : 
IClaudeStreamTransport _process = new Process { StartInfo = psi }; _process.Start(); + ProcessJobObject.Assign(_process, _logger); // Keep stdin open — turns are driven by WriteLineAsync calls. _process.StandardInput.AutoFlush = false; diff --git a/src/ClaudeDo.Worker/Runner/ProcessJobObject.cs b/src/ClaudeDo.Worker/Runner/ProcessJobObject.cs new file mode 100644 index 0000000..903ead7 --- /dev/null +++ b/src/ClaudeDo.Worker/Runner/ProcessJobObject.cs @@ -0,0 +1,157 @@ +using System.Diagnostics; +using System.Runtime.InteropServices; +using System.Runtime.Versioning; + +namespace ClaudeDo.Worker.Runner; + +// Process leak guard (Windows). The worker spawns `claude` (which spawns node + conhost). +// Cancellation kills the tree via Process.Kill(entireProcessTree: true), but a *hard* worker +// death — Task Manager End Task, crash, OS restart, installer update — runs no cleanup, so the +// whole claude→node→conhost tree is orphaned and lingers. Over many restarts these pile up. +// +// Fix: one Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, created once and held for the +// worker's lifetime. Every spawned claude process is assigned to it. The worker holds the only +// handle to the job; when the worker process terminates for ANY reason the OS closes that handle +// and tears down every process still in the job. This is the only mechanism that survives a hard +// kill — which entireProcessTree cannot. 
+/// <summary>
+/// Assigns spawned processes to a single, lazily created Windows Job Object configured with
+/// <c>JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE</c>. The job handle is stored in a static field and is
+/// intentionally never closed by this class. All operations are best-effort: failures are
+/// reported through a single warning and never thrown to the caller.
+/// </summary>
+internal static class ProcessJobObject
+{
+    // Info-class value passed to SetInformationJobObject together with a
+    // JOBOBJECT_EXTENDED_LIMIT_INFORMATION payload.
+    private const int JobObjectExtendedLimitInformation = 9;
+    private const uint JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x2000;
+
+    private static readonly object _gate = new();
+    private static IntPtr _job = IntPtr.Zero; // held for process lifetime, never closed
+    private static bool _disabled;            // set once job creation has failed; Assign becomes a no-op
+    private static bool _warned;              // ensures at most one warning is ever logged
+
+    /// <summary>
+    /// Adds <paramref name="process"/> to the shared job object, creating the job on first use.
+    /// Does nothing on non-Windows platforms or after job creation has failed once.
+    /// </summary>
+    /// <param name="process">An already started process whose handle is assigned to the job.</param>
+    /// <param name="logger">Receives the one-time informational / warning messages.</param>
+    /// <remarks>
+    /// NOTE(review): the process is assigned after it has been started, so anything it spawns
+    /// before this call completes is presumably not covered by the job — confirm against the
+    /// Win32 job object documentation if that window matters.
+    /// </remarks>
+    public static void Assign(Process process, ILogger logger)
+    {
+        // Cheap unlocked fast path; _disabled is re-checked under the lock below.
+        if (!OperatingSystem.IsWindows() || _disabled) return;
+
+        lock (_gate)
+        {
+            if (_disabled) return;
+
+            if (_job == IntPtr.Zero)
+            {
+                _job = TryCreateJob(logger);
+                if (_job == IntPtr.Zero)
+                {
+                    // Creation failed (already logged); do not retry on every spawn.
+                    _disabled = true;
+                    return;
+                }
+            }
+
+            try
+            {
+                if (!AssignProcessToJobObject(_job, process.Handle))
+                    WarnOnce(logger, "AssignProcessToJobObject failed", Marshal.GetLastWin32Error());
+            }
+            catch (Exception ex)
+            {
+                // process may have already exited, or handle is unavailable — best effort.
+                WarnOnce(logger, $"could not assign process to job: {ex.Message}", 0);
+            }
+        }
+    }
+
+    /// <summary>
+    /// Creates an unnamed job object and sets <c>JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE</c> on it.
+    /// </summary>
+    /// <param name="logger">Receives the success message or the one-time failure warning.</param>
+    /// <returns>The job handle, or <see cref="IntPtr.Zero"/> if either Win32 call failed.</returns>
+    [SupportedOSPlatform("windows")]
+    private static IntPtr TryCreateJob(ILogger logger)
+    {
+        var job = CreateJobObject(IntPtr.Zero, null);
+        if (job == IntPtr.Zero)
+        {
+            WarnOnce(logger, "CreateJobObject failed", Marshal.GetLastWin32Error());
+            return IntPtr.Zero;
+        }
+
+        var info = new JOBOBJECT_EXTENDED_LIMIT_INFORMATION
+        {
+            BasicLimitInformation = new JOBOBJECT_BASIC_LIMIT_INFORMATION
+            {
+                LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
+            },
+        };
+
+        // The size must be that of the exact struct marshalled below; it is also the
+        // cbJobObjectInfoLength passed to SetInformationJobObject.
+        var length = Marshal.SizeOf<JOBOBJECT_EXTENDED_LIMIT_INFORMATION>();
+        var ptr = Marshal.AllocHGlobal(length);
+        try
+        {
+            Marshal.StructureToPtr(info, ptr, false);
+            if (!SetInformationJobObject(job, JobObjectExtendedLimitInformation, ptr, (uint)length))
+            {
+                // Read the error code before CloseHandle can overwrite it.
+                WarnOnce(logger, "SetInformationJobObject failed", Marshal.GetLastWin32Error());
+                CloseHandle(job);
+                return IntPtr.Zero;
+            }
+        }
+        finally
+        {
+            Marshal.FreeHGlobal(ptr);
+        }
+
+        logger.LogInformation("Process job object created — spawned claude processes will be killed if the worker dies.");
+        return job;
+    }
+
+    /// <summary>
+    /// Logs a warning the first time it is called; every later call is silently ignored,
+    /// regardless of which failure triggered it.
+    /// </summary>
+    private static void WarnOnce(ILogger logger, string message, int win32Error)
+    {
+        if (_warned) return;
+        _warned = true;
+        logger.LogWarning(
+            "Process leak guard inactive: {Message} (win32={Win32Error}). Orphaned claude processes may survive a hard worker kill.",
+            message, win32Error);
+    }
+
+    [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
+    private static extern IntPtr CreateJobObject(IntPtr lpJobAttributes, string? lpName);
+
+    [DllImport("kernel32.dll", SetLastError = true)]
+    [return: MarshalAs(UnmanagedType.Bool)]
+    private static extern bool SetInformationJobObject(
+        IntPtr hJob, int jobObjectInfoClass, IntPtr lpJobObjectInfo, uint cbJobObjectInfoLength);
+
+    [DllImport("kernel32.dll", SetLastError = true)]
+    [return: MarshalAs(UnmanagedType.Bool)]
+    private static extern bool AssignProcessToJobObject(IntPtr hJob, IntPtr hProcess);
+
+    [DllImport("kernel32.dll", SetLastError = true)]
+    [return: MarshalAs(UnmanagedType.Bool)]
+    private static extern bool CloseHandle(IntPtr hObject);
+
+    // Managed mirrors of the Win32 structs; field order and types define the marshalled layout.
+    [StructLayout(LayoutKind.Sequential)]
+    private struct JOBOBJECT_BASIC_LIMIT_INFORMATION
+    {
+        public long PerProcessUserTimeLimit;
+        public long PerJobUserTimeLimit;
+        public uint LimitFlags;
+        public UIntPtr MinimumWorkingSetSize;
+        public UIntPtr MaximumWorkingSetSize;
+        public uint ActiveProcessLimit;
+        public UIntPtr Affinity;
+        public uint PriorityClass;
+        public uint SchedulingClass;
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    private struct IO_COUNTERS
+    {
+        public ulong ReadOperationCount;
+        public ulong WriteOperationCount;
+        public ulong OtherOperationCount;
+        public ulong ReadTransferCount;
+        public ulong WriteTransferCount;
+        public ulong OtherTransferCount;
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    private struct JOBOBJECT_EXTENDED_LIMIT_INFORMATION
+    {
+        public JOBOBJECT_BASIC_LIMIT_INFORMATION BasicLimitInformation;
+        public IO_COUNTERS IoInfo;
+        public UIntPtr ProcessMemoryLimit;
+        public UIntPtr JobMemoryLimit;
+        public UIntPtr PeakProcessMemoryUsed;
+        public UIntPtr PeakJobMemoryUsed;
+    }
+}