fix(worker): kill spawned claude trees when the worker dies
Spawned claude processes were only torn down on graceful cancellation (Process.Kill(entireProcessTree)). A hard worker death — Task Manager End Task, crash, OS restart, installer update — ran no cleanup, orphaning the claude->node->conhost tree, which lingered and piled up across restarts. Assign every spawned claude process to a Windows Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. The worker holds the only handle, so when it terminates for any reason the OS tears down the whole job. No-op on non-Windows; best-effort with a one-time warning if the Win32 calls fail.
This commit is contained in:
@@ -46,6 +46,7 @@ public sealed class ClaudeProcess : IClaudeProcess
|
|||||||
|
|
||||||
using var process = new Process { StartInfo = psi };
|
using var process = new Process { StartInfo = psi };
|
||||||
process.Start();
|
process.Start();
|
||||||
|
ProcessJobObject.Assign(process, _logger);
|
||||||
|
|
||||||
await process.StandardInput.WriteAsync(prompt);
|
await process.StandardInput.WriteAsync(prompt);
|
||||||
process.StandardInput.Close();
|
process.StandardInput.Close();
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ public sealed class ProcessClaudeStreamTransport : IClaudeStreamTransport
|
|||||||
|
|
||||||
_process = new Process { StartInfo = psi };
|
_process = new Process { StartInfo = psi };
|
||||||
_process.Start();
|
_process.Start();
|
||||||
|
ProcessJobObject.Assign(_process, _logger);
|
||||||
|
|
||||||
// Keep stdin open — turns are driven by WriteLineAsync calls.
|
// Keep stdin open — turns are driven by WriteLineAsync calls.
|
||||||
_process.StandardInput.AutoFlush = false;
|
_process.StandardInput.AutoFlush = false;
|
||||||
|
|||||||
157
src/ClaudeDo.Worker/Runner/ProcessJobObject.cs
Normal file
157
src/ClaudeDo.Worker/Runner/ProcessJobObject.cs
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
using System.Diagnostics;
|
||||||
|
using System.Runtime.InteropServices;
|
||||||
|
using System.Runtime.Versioning;
|
||||||
|
|
||||||
|
namespace ClaudeDo.Worker.Runner;
|
||||||
|
|
||||||
|
// Process leak guard (Windows). The worker spawns `claude` (which spawns node + conhost).
|
||||||
|
// Cancellation kills the tree via Process.Kill(entireProcessTree: true), but a *hard* worker
|
||||||
|
// death — Task Manager End Task, crash, OS restart, installer update — runs no cleanup, so the
|
||||||
|
// whole claude→node→conhost tree is orphaned and lingers. Over many restarts these pile up.
|
||||||
|
//
|
||||||
|
// Fix: one Job Object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, created once and held for the
|
||||||
|
// worker's lifetime. Every spawned claude process is assigned to it. The worker holds the only
|
||||||
|
// handle to the job; when the worker process terminates for ANY reason the OS closes that handle
|
||||||
|
// and tears down every process still in the job. This is the only mechanism that survives a hard
|
||||||
|
// kill — which entireProcessTree cannot.
|
||||||
|
internal static class ProcessJobObject
{
    // JOBOBJECTINFOCLASS value passed to SetInformationJobObject for the extended-limit struct.
    private const int JobObjectExtendedLimitInformation = 9;

    // LimitFlags bit asking the OS to terminate every process in the job when the last job handle closes.
    private const uint JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x2000;

    // Serializes lazy job creation and every assignment; all writes to the fields below happen under it.
    private static readonly object _gate = new();

    // Handle of the shared job. Deliberately never closed by this class: it is held for the
    // lifetime of the process so that process termination is what closes it.
    private static IntPtr _job = IntPtr.Zero;

    // Latched to true once job creation has failed; Assign then returns without doing anything.
    private static bool _disabled;

    // Latched to true after the first warning so the log is not flooded on repeated failures.
    private static bool _warned;

    /// <summary>
    /// Adds <paramref name="process"/> to the shared kill-on-close job, creating the job on first use.
    /// Does nothing on non-Windows platforms or once job creation has failed. Failures to assign are
    /// reported through a single warning and are otherwise ignored (best effort).
    /// </summary>
    /// <param name="process">An already-started process whose handle is assigned to the job.</param>
    /// <param name="logger">Receives the one-time informational / warning messages.</param>
    public static void Assign(Process process, ILogger logger)
    {
        if (!OperatingSystem.IsWindows())
        {
            return;
        }

        // Cheap unlocked fast path; the flag is re-checked under the lock below.
        if (_disabled)
        {
            return;
        }

        lock (_gate)
        {
            if (_disabled || !EnsureJob(logger))
            {
                return;
            }

            AttachBestEffort(process, logger);
        }
    }

    /// <summary>
    /// Makes sure <c>_job</c> holds a job handle, creating it on first call. On creation failure the
    /// guard is switched off via <c>_disabled</c>. Must be called while holding <c>_gate</c>.
    /// </summary>
    /// <returns><c>true</c> when a job handle is available; <c>false</c> when creation failed.</returns>
    [SupportedOSPlatform("windows")]
    private static bool EnsureJob(ILogger logger)
    {
        if (_job != IntPtr.Zero)
        {
            return true;
        }

        _job = TryCreateJob(logger);
        if (_job != IntPtr.Zero)
        {
            return true;
        }

        _disabled = true;
        return false;
    }

    /// <summary>
    /// Assigns the process to the existing job, converting both a failed Win32 call and any thrown
    /// exception into a one-time warning. Must be called while holding <c>_gate</c>.
    /// </summary>
    private static void AttachBestEffort(Process process, ILogger logger)
    {
        try
        {
            var attached = AssignProcessToJobObject(_job, process.Handle);
            if (attached)
            {
                return;
            }

            // Read the error code straight away, before any other P/Invoke can overwrite it.
            WarnOnce(logger, "AssignProcessToJobObject failed", Marshal.GetLastWin32Error());
        }
        catch (Exception ex)
        {
            // process may have already exited, or handle is unavailable — best effort.
            WarnOnce(logger, $"could not assign process to job: {ex.Message}", 0);
        }
    }

    /// <summary>
    /// Creates an unnamed job object and configures it with the kill-on-job-close limit.
    /// </summary>
    /// <returns>
    /// The job handle on success; <see cref="IntPtr.Zero"/> if either Win32 call failed (a warning is
    /// logged, and a partially configured job handle is closed before returning).
    /// </returns>
    [SupportedOSPlatform("windows")]
    private static IntPtr TryCreateJob(ILogger logger)
    {
        IntPtr handle = CreateJobObject(IntPtr.Zero, null);
        if (handle == IntPtr.Zero)
        {
            WarnOnce(logger, "CreateJobObject failed", Marshal.GetLastWin32Error());
            return IntPtr.Zero;
        }

        // Everything stays zeroed except the single limit flag we care about.
        JOBOBJECT_EXTENDED_LIMIT_INFORMATION limits = default;
        limits.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE;

        int size = Marshal.SizeOf<JOBOBJECT_EXTENDED_LIMIT_INFORMATION>();
        IntPtr buffer = Marshal.AllocHGlobal(size);
        try
        {
            Marshal.StructureToPtr(limits, buffer, false);

            bool configured = SetInformationJobObject(handle, JobObjectExtendedLimitInformation, buffer, (uint)size);
            if (!configured)
            {
                // Capture the error before CloseHandle, which would otherwise replace it.
                WarnOnce(logger, "SetInformationJobObject failed", Marshal.GetLastWin32Error());
                CloseHandle(handle);
                return IntPtr.Zero;
            }
        }
        finally
        {
            Marshal.FreeHGlobal(buffer);
        }

        logger.LogInformation("Process job object created — spawned claude processes will be killed if the worker dies.");
        return handle;
    }

    /// <summary>
    /// Logs the "leak guard inactive" warning the first time it is called; later calls are ignored.
    /// </summary>
    /// <param name="logger">Logger that receives the warning.</param>
    /// <param name="message">Short description of what failed.</param>
    /// <param name="win32Error">Win32 error code to include, or 0 when none applies.</param>
    private static void WarnOnce(ILogger logger, string message, int win32Error)
    {
        if (!_warned)
        {
            _warned = true;
            logger.LogWarning(
                "Process leak guard inactive: {Message} (win32={Win32Error}). Orphaned claude processes may survive a hard worker kill.",
                message,
                win32Error);
        }
    }

    [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
    private static extern IntPtr CreateJobObject(IntPtr lpJobAttributes, string? lpName);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool SetInformationJobObject(
        IntPtr hJob,
        int jobObjectInfoClass,
        IntPtr lpJobObjectInfo,
        uint cbJobObjectInfoLength);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool AssignProcessToJobObject(IntPtr hJob, IntPtr hProcess);

    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool CloseHandle(IntPtr hObject);

    // Managed mirror of the Win32 struct of the same name. Field order and types define the
    // marshalled layout, so they must not be reordered. Only LimitFlags is populated here.
    [StructLayout(LayoutKind.Sequential)]
    private struct JOBOBJECT_BASIC_LIMIT_INFORMATION
    {
        public long PerProcessUserTimeLimit;
        public long PerJobUserTimeLimit;
        public uint LimitFlags;
        public UIntPtr MinimumWorkingSetSize;
        public UIntPtr MaximumWorkingSetSize;
        public uint ActiveProcessLimit;
        public UIntPtr Affinity;
        public uint PriorityClass;
        public uint SchedulingClass;
    }

    // Managed mirror of the Win32 struct of the same name; present only so the extended-limit
    // struct below has the right size and layout. Never populated by this class.
    [StructLayout(LayoutKind.Sequential)]
    private struct IO_COUNTERS
    {
        public ulong ReadOperationCount;
        public ulong WriteOperationCount;
        public ulong OtherOperationCount;
        public ulong ReadTransferCount;
        public ulong WriteTransferCount;
        public ulong OtherTransferCount;
    }

    // Managed mirror of the Win32 struct of the same name: the payload handed to
    // SetInformationJobObject together with JobObjectExtendedLimitInformation.
    [StructLayout(LayoutKind.Sequential)]
    private struct JOBOBJECT_EXTENDED_LIMIT_INFORMATION
    {
        public JOBOBJECT_BASIC_LIMIT_INFORMATION BasicLimitInformation;
        public IO_COUNTERS IoInfo;
        public UIntPtr ProcessMemoryLimit;
        public UIntPtr JobMemoryLimit;
        public UIntPtr PeakProcessMemoryUsed;
        public UIntPtr PeakJobMemoryUsed;
    }
}
|
||||||
Reference in New Issue
Block a user