fix(worker): kill spawned claude trees when the worker dies

Spawned claude processes were only torn down on graceful cancellation
(Process.Kill(entireProcessTree)). A hard worker death — Task Manager
End Task, crash, OS restart, installer update — ran no cleanup, orphaning
the claude->node->conhost tree, which lingered and piled up across
restarts.

Assign every spawned claude process to a Windows Job Object with
JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. The worker holds the only handle, so
when it terminates for any reason the OS tears down the whole job. No-op
on non-Windows; best-effort with a one-time warning if the Win32 calls
fail.
This commit is contained in:
Mika Kuns
2026-06-26 14:34:31 +02:00
parent 3eea2b7c96
commit faf6104645
3 changed files with 159 additions and 0 deletions

View File

@@ -46,6 +46,7 @@ public sealed class ClaudeProcess : IClaudeProcess
using var process = new Process { StartInfo = psi }; using var process = new Process { StartInfo = psi };
process.Start(); process.Start();
ProcessJobObject.Assign(process, _logger);
await process.StandardInput.WriteAsync(prompt); await process.StandardInput.WriteAsync(prompt);
process.StandardInput.Close(); process.StandardInput.Close();

View File

@@ -45,6 +45,7 @@ public sealed class ProcessClaudeStreamTransport : IClaudeStreamTransport
_process = new Process { StartInfo = psi }; _process = new Process { StartInfo = psi };
_process.Start(); _process.Start();
ProcessJobObject.Assign(_process, _logger);
// Keep stdin open — turns are driven by WriteLineAsync calls. // Keep stdin open — turns are driven by WriteLineAsync calls.
_process.StandardInput.AutoFlush = false; _process.StandardInput.AutoFlush = false;

View File

@@ -0,0 +1,157 @@
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Runtime.Versioning;
namespace ClaudeDo.Worker.Runner;
// Process leak guard (Windows). The worker spawns `claude` (which spawns node + conhost).
// Cancellation kills the tree via Process.Kill(entireProcessTree: true), but a *hard* worker
// death — Task Manager End Task, crash, OS restart, installer update — runs no cleanup, so the
// whole claude→node→conhost tree is orphaned and lingers. Over many restarts these pile up.
//
// Fix: one Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, created once and held for the
// worker's lifetime. Every spawned claude process is assigned to it. The worker holds the only
// handle to the job; when the worker process terminates for ANY reason the OS closes that handle
// and tears down every process still in the job. This is the only mechanism that survives a hard
// kill of the worker — Process.Kill(entireProcessTree: true) cannot, because no cleanup code runs.
/// <summary>
/// Places spawned processes into a single Job Object configured with
/// <c>JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE</c>. The job handle is created lazily on the
/// first <see cref="Assign"/> call and is never closed by this class.
/// </summary>
/// <remarks>
/// Every failure path is best-effort: it logs at most one warning (shared across all
/// failure kinds) and never throws to the caller.
/// </remarks>
internal static class ProcessJobObject
{
    // JOBOBJECTINFOCLASS value passed to SetInformationJobObject for the extended-limit struct.
    private const int JobObjectExtendedLimitInformation = 9;

    // Limit flag stored in JOBOBJECT_BASIC_LIMIT_INFORMATION.LimitFlags.
    private const uint JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x2000;

    // Serializes job creation, assignment and the warn-once bookkeeping.
    private static readonly object _gate = new();

    private static IntPtr _job = IntPtr.Zero; // held for process lifetime, never closed

    // Set once job creation has failed; every later Assign call then returns immediately.
    private static bool _disabled;

    // Set after the first warning so that later failures stay silent.
    private static bool _warned;

    /// <summary>
    /// Adds <paramref name="process"/> to the shared job object, creating the job on first use.
    /// Returns without doing anything on non-Windows platforms or once job creation has failed.
    /// </summary>
    /// <param name="process">The already-started process whose handle is assigned to the job.</param>
    /// <param name="logger">Receives the one-time informational and warning messages.</param>
    public static void Assign(Process process, ILogger logger)
    {
        if (!OperatingSystem.IsWindows())
        {
            return;
        }

        // Cheap unlocked check; the flag is re-read under the lock below.
        if (_disabled)
        {
            return;
        }

        lock (_gate)
        {
            if (_disabled || !EnsureJobCreated(logger))
            {
                return;
            }

            try
            {
                var assigned = AssignProcessToJobObject(_job, process.Handle);
                if (assigned)
                {
                    return;
                }

                // Read the error code straight after the failed call.
                var lastError = Marshal.GetLastWin32Error();
                WarnOnce(logger, "AssignProcessToJobObject failed", lastError);
            }
            catch (Exception ex)
            {
                // The process may already be gone or its handle unavailable — best effort only.
                WarnOnce(logger, $"could not assign process to job: {ex.Message}", 0);
            }
        }
    }

    /// <summary>
    /// Makes sure <c>_job</c> holds a job handle, creating it if necessary. On failure the
    /// guard is disabled permanently. Must be called while holding <c>_gate</c>.
    /// </summary>
    /// <returns><c>true</c> when a job handle is available; otherwise <c>false</c>.</returns>
    [SupportedOSPlatform("windows")]
    private static bool EnsureJobCreated(ILogger logger)
    {
        if (_job != IntPtr.Zero)
        {
            return true;
        }

        _job = TryCreateJob(logger);
        if (_job != IntPtr.Zero)
        {
            return true;
        }

        _disabled = true;
        return false;
    }

    /// <summary>
    /// Creates an unnamed job object and sets the kill-on-close limit on it.
    /// </summary>
    /// <returns>
    /// The job handle, or <see cref="IntPtr.Zero"/> when either Win32 call fails
    /// (a job created before the failure is closed again).
    /// </returns>
    [SupportedOSPlatform("windows")]
    private static IntPtr TryCreateJob(ILogger logger)
    {
        var handle = CreateJobObject(IntPtr.Zero, null);
        if (handle == IntPtr.Zero)
        {
            WarnOnce(logger, "CreateJobObject failed", Marshal.GetLastWin32Error());
            return IntPtr.Zero;
        }

        // Only the kill-on-close flag is set; every other field stays at its zero default.
        var basicLimits = new JOBOBJECT_BASIC_LIMIT_INFORMATION
        {
            LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
        };
        var limits = new JOBOBJECT_EXTENDED_LIMIT_INFORMATION
        {
            BasicLimitInformation = basicLimits,
        };

        var size = Marshal.SizeOf<JOBOBJECT_EXTENDED_LIMIT_INFORMATION>();
        var buffer = Marshal.AllocHGlobal(size);
        try
        {
            Marshal.StructureToPtr(limits, buffer, false);

            var applied = SetInformationJobObject(handle, JobObjectExtendedLimitInformation, buffer, (uint)size);
            if (!applied)
            {
                WarnOnce(logger, "SetInformationJobObject failed", Marshal.GetLastWin32Error());
                CloseHandle(handle);
                return IntPtr.Zero;
            }
        }
        finally
        {
            // The unmanaged copy is only needed for the duration of the call above.
            Marshal.FreeHGlobal(buffer);
        }

        logger.LogInformation("Process job object created — spawned claude processes will be killed if the worker dies.");
        return handle;
    }

    /// <summary>
    /// Logs the "leak guard inactive" warning the first time it is called; later calls are no-ops.
    /// </summary>
    /// <param name="logger">Destination for the warning.</param>
    /// <param name="message">Short description of what failed.</param>
    /// <param name="win32Error">Win32 error code reported by the caller (callers pass 0 when none applies).</param>
    private static void WarnOnce(ILogger logger, string message, int win32Error)
    {
        if (!_warned)
        {
            _warned = true;
            logger.LogWarning(
                "Process leak guard inactive: {Message} (win32={Win32Error}). Orphaned claude processes may survive a hard worker kill.",
                message, win32Error);
        }
    }

    // ---- kernel32 imports ----

    [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
    private static extern IntPtr CreateJobObject(IntPtr lpJobAttributes, string? lpName);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool SetInformationJobObject(
        IntPtr hJob, int jobObjectInfoClass, IntPtr lpJobObjectInfo, uint cbJobObjectInfoLength);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool AssignProcessToJobObject(IntPtr hJob, IntPtr hProcess);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool CloseHandle(IntPtr hObject);

    // ---- interop structs ----
    // Marshalled byte-for-byte into unmanaged memory, so field order and field types
    // must not be changed.

    [StructLayout(LayoutKind.Sequential)]
    private struct JOBOBJECT_BASIC_LIMIT_INFORMATION
    {
        public long PerProcessUserTimeLimit;
        public long PerJobUserTimeLimit;
        public uint LimitFlags; // the only field this class sets
        public UIntPtr MinimumWorkingSetSize;
        public UIntPtr MaximumWorkingSetSize;
        public uint ActiveProcessLimit;
        public UIntPtr Affinity;
        public uint PriorityClass;
        public uint SchedulingClass;
    }

    [StructLayout(LayoutKind.Sequential)]
    private struct IO_COUNTERS
    {
        public ulong ReadOperationCount;
        public ulong WriteOperationCount;
        public ulong OtherOperationCount;
        public ulong ReadTransferCount;
        public ulong WriteTransferCount;
        public ulong OtherTransferCount;
    }

    [StructLayout(LayoutKind.Sequential)]
    private struct JOBOBJECT_EXTENDED_LIMIT_INFORMATION
    {
        public JOBOBJECT_BASIC_LIMIT_INFORMATION BasicLimitInformation;
        public IO_COUNTERS IoInfo;
        public UIntPtr ProcessMemoryLimit;
        public UIntPtr JobMemoryLimit;
        public UIntPtr PeakProcessMemoryUsed;
        public UIntPtr PeakJobMemoryUsed;
    }
}