fix(worker): kill spawned claude trees when the worker dies
Spawned claude processes were only torn down on graceful cancellation (Process.Kill(entireProcessTree)). A hard worker death — Task Manager End Task, crash, OS restart, installer update — ran no cleanup, orphaning the claude->node->conhost tree, which lingered and piled up across restarts. Assign every spawned claude process to a Windows Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. The worker holds the only handle, so when it terminates for any reason the OS tears down the whole job. No-op on non-Windows; best-effort with a one-time warning if the Win32 calls fail.
This commit is contained in:
@@ -46,6 +46,7 @@ public sealed class ClaudeProcess : IClaudeProcess
|
||||
|
||||
using var process = new Process { StartInfo = psi };
|
||||
process.Start();
|
||||
ProcessJobObject.Assign(process, _logger);
|
||||
|
||||
await process.StandardInput.WriteAsync(prompt);
|
||||
process.StandardInput.Close();
|
||||
|
||||
@@ -45,6 +45,7 @@ public sealed class ProcessClaudeStreamTransport : IClaudeStreamTransport
|
||||
|
||||
_process = new Process { StartInfo = psi };
|
||||
_process.Start();
|
||||
ProcessJobObject.Assign(_process, _logger);
|
||||
|
||||
// Keep stdin open — turns are driven by WriteLineAsync calls.
|
||||
_process.StandardInput.AutoFlush = false;
|
||||
|
||||
157
src/ClaudeDo.Worker/Runner/ProcessJobObject.cs
Normal file
157
src/ClaudeDo.Worker/Runner/ProcessJobObject.cs
Normal file
@@ -0,0 +1,157 @@
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Versioning;
|
||||
|
||||
namespace ClaudeDo.Worker.Runner;
|
||||
|
||||
// Process leak guard (Windows). The worker spawns `claude` (which spawns node + conhost).
// Cancellation kills the tree via Process.Kill(entireProcessTree: true), but a *hard* worker
// death — Task Manager End Task, crash, OS restart, installer update — runs no cleanup, so the
// whole claude→node→conhost tree is orphaned and lingers. Over many restarts these pile up.
//
// Fix: one Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, created once and held for the
// worker's lifetime. Every spawned claude process is assigned to it. The worker holds the only
// handle to the job; when the worker process terminates for ANY reason the OS closes that handle
// and tears down every process still in the job. This is the only mechanism that survives a hard
// kill — which entireProcessTree cannot.
internal static class ProcessJobObject
{
    // Info-class value passed to SetInformationJobObject together with a
    // JOBOBJECT_EXTENDED_LIMIT_INFORMATION payload.
    private const int JobObjectExtendedLimitInformation = 9;

    // Limit flag asking the OS to terminate every process in the job when the last job handle closes.
    private const uint JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x2000;

    private static readonly object _gate = new();
    private static IntPtr _job = IntPtr.Zero; // held for process lifetime, never closed
    private static bool _disabled;            // set once job creation has failed; Assign becomes a no-op
    private static bool _warned;              // latch so the "guard inactive" warning is logged at most once

    /// <summary>
    /// Best-effort: adds <paramref name="process"/> to the worker-wide kill-on-close job,
    /// creating the job on first use. Never throws for Win32 or process-state failures;
    /// those are reported through a one-time warning instead.
    /// </summary>
    /// <param name="process">A process that has already been started.</param>
    /// <param name="logger">Receives the one-time informational and warning messages.</param>
    /// <remarks>
    /// No-op on non-Windows platforms and after job creation has failed once.
    /// NOTE(review): the process is assigned after it has started, so anything it spawns
    /// before this call completes is presumably outside the job — confirm whether that
    /// window matters for the claude launcher.
    /// </remarks>
    public static void Assign(Process process, ILogger logger)
    {
        // Cheap unsynchronized pre-check; _disabled is re-checked under the lock below.
        if (!OperatingSystem.IsWindows() || _disabled) return;

        lock (_gate)
        {
            if (_disabled) return;

            if (_job == IntPtr.Zero)
            {
                _job = TryCreateJob(logger);
                if (_job == IntPtr.Zero)
                {
                    // Creation failed (already warned inside TryCreateJob); stop retrying.
                    _disabled = true;
                    return;
                }
            }

            try
            {
                if (AssignProcessToJobObject(_job, process.Handle)) return;

                // Capture the error code immediately: the HasExited probe below performs
                // further native calls that may overwrite the thread's last-error value.
                var win32Error = Marshal.GetLastWin32Error();

                // A process that has already exited cannot be orphaned, and the job itself is
                // still healthy. Do not report the guard as inactive (and do not consume the
                // one-time warning) for that benign case.
                if (process.HasExited) return;

                WarnOnce(logger, "AssignProcessToJobObject failed", win32Error);
            }
            catch (Exception ex)
            {
                // process may have already exited, or handle is unavailable — best effort.
                WarnOnce(logger, $"could not assign process to job: {ex.Message}", 0);
            }
        }
    }

    /// <summary>
    /// Creates an unnamed job object and configures it with
    /// <c>JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE</c>.
    /// </summary>
    /// <param name="logger">Receives the success message or the one-time failure warning.</param>
    /// <returns>The job handle, or <see cref="IntPtr.Zero"/> if either Win32 call failed.</returns>
    [SupportedOSPlatform("windows")]
    private static IntPtr TryCreateJob(ILogger logger)
    {
        var job = CreateJobObject(IntPtr.Zero, null);
        if (job == IntPtr.Zero)
        {
            WarnOnce(logger, "CreateJobObject failed", Marshal.GetLastWin32Error());
            return IntPtr.Zero;
        }

        // Only the limit flags are set; every other field stays at its zero default.
        var info = new JOBOBJECT_EXTENDED_LIMIT_INFORMATION
        {
            BasicLimitInformation = new JOBOBJECT_BASIC_LIMIT_INFORMATION
            {
                LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
            },
        };

        var length = Marshal.SizeOf<JOBOBJECT_EXTENDED_LIMIT_INFORMATION>();
        var ptr = Marshal.AllocHGlobal(length);
        try
        {
            Marshal.StructureToPtr(info, ptr, false);
            if (!SetInformationJobObject(job, JobObjectExtendedLimitInformation, ptr, (uint)length))
            {
                WarnOnce(logger, "SetInformationJobObject failed", Marshal.GetLastWin32Error());

                // The job is useless without the kill-on-close limit; release the handle.
                CloseHandle(job);
                return IntPtr.Zero;
            }
        }
        finally
        {
            Marshal.FreeHGlobal(ptr);
        }

        logger.LogInformation("Process job object created — spawned claude processes will be killed if the worker dies.");
        return job;
    }

    /// <summary>
    /// Logs the "leak guard inactive" warning the first time it is called and does nothing
    /// on later calls. All call sites in this class run while <c>_gate</c> is held.
    /// </summary>
    /// <param name="logger">Logger that receives the warning.</param>
    /// <param name="message">Short description of what failed.</param>
    /// <param name="win32Error">Win32 error code, or 0 when none applies.</param>
    private static void WarnOnce(ILogger logger, string message, int win32Error)
    {
        if (_warned) return;
        _warned = true;
        logger.LogWarning(
            "Process leak guard inactive: {Message} (win32={Win32Error}). Orphaned claude processes may survive a hard worker kill.",
            message, win32Error);
    }

    [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
    private static extern IntPtr CreateJobObject(IntPtr lpJobAttributes, string? lpName);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool SetInformationJobObject(
        IntPtr hJob, int jobObjectInfoClass, IntPtr lpJobObjectInfo, uint cbJobObjectInfoLength);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool AssignProcessToJobObject(IntPtr hJob, IntPtr hProcess);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool CloseHandle(IntPtr hObject);

    // Managed mirror of the native structure of the same name; field order and sizes
    // must match the native layout because it is marshalled by value.
    [StructLayout(LayoutKind.Sequential)]
    private struct JOBOBJECT_BASIC_LIMIT_INFORMATION
    {
        public long PerProcessUserTimeLimit;
        public long PerJobUserTimeLimit;
        public uint LimitFlags;
        public UIntPtr MinimumWorkingSetSize;
        public UIntPtr MaximumWorkingSetSize;
        public uint ActiveProcessLimit;
        public UIntPtr Affinity;
        public uint PriorityClass;
        public uint SchedulingClass;
    }

    // Embedded in JOBOBJECT_EXTENDED_LIMIT_INFORMATION; never populated here, present
    // only so the enclosing structure has the correct size and offsets.
    [StructLayout(LayoutKind.Sequential)]
    private struct IO_COUNTERS
    {
        public ulong ReadOperationCount;
        public ulong WriteOperationCount;
        public ulong OtherOperationCount;
        public ulong ReadTransferCount;
        public ulong WriteTransferCount;
        public ulong OtherTransferCount;
    }

    // Payload handed to SetInformationJobObject for the extended-limit info class.
    [StructLayout(LayoutKind.Sequential)]
    private struct JOBOBJECT_EXTENDED_LIMIT_INFORMATION
    {
        public JOBOBJECT_BASIC_LIMIT_INFORMATION BasicLimitInformation;
        public IO_COUNTERS IoInfo;
        public UIntPtr ProcessMemoryLimit;
        public UIntPtr JobMemoryLimit;
        public UIntPtr PeakProcessMemoryUsed;
        public UIntPtr PeakJobMemoryUsed;
    }
}
|
||||
Reference in New Issue
Block a user