Skip to content

Commit 1108683

Browse files
kouveljkotas
authored andcommitted
Fix a thread pool deadlock issue (dotnet#6822)
Fixes dotnet#6780: - `ShouldStopProcessingWorkNow` was checking the count of existing threads and decrementing the count of processing threads. Due to differences from CoreCLR, the condition needs to be different. The effect was that when hill climbing decides to crease the thread count goal, `ShouldStop...` stops all threads that were processing work if there are enough existing threads, and even though there are several thread requests, the now-all-waiting threads are not released to process more work, leading to deadlock. The condition in `ShouldAdjustMaxWorkersActive` was also incorrect, fixed both. - Fixed a few other small things that I saw
1 parent b06cafa commit 1108683

File tree

6 files changed

+115
-36
lines changed

6 files changed

+115
-36
lines changed

src/System.Private.CoreLib/src/System/Threading/ClrThreadPool.HillClimbing.cs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,19 @@ private partial class HillClimbing
2323
private static HillClimbing CreateHillClimber()
2424
{
2525
// Default values pulled from CoreCLR
26-
return new HillClimbing(wavePeriod: AppContextConfigHelper.GetInt32Config("HillClimbing_WavePeriod", 4),
27-
maxWaveMagnitude: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxWaveMagnitude", 20),
28-
waveMagnitudeMultiplier: AppContextConfigHelper.GetInt32Config("HillClimbing_WaveMagnitudeMultiplier", 100) / 100.0,
29-
waveHistorySize: AppContextConfigHelper.GetInt32Config("HillClimbing_WaveHistorySize", 8),
30-
targetThroughputRatio: AppContextConfigHelper.GetInt32Config("HillClimbing_Bias", 15) / 100.0,
31-
targetSignalToNoiseRatio: AppContextConfigHelper.GetInt32Config("HillClimbing_TargetSignalToNoiseRatio", 300) / 100.0,
32-
maxChangePerSecond: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxChangePerSecond", 4),
33-
maxChangePerSample: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxChangePerSample", 20),
34-
sampleIntervalMsLow: AppContextConfigHelper.GetInt32Config("HillClimbing_SampleIntervalLow", DefaultSampleIntervalMsLow, allowNegative: false),
35-
sampleIntervalMsHigh: AppContextConfigHelper.GetInt32Config("HillClimbing_SampleIntervalHigh", DefaultSampleIntervalMsHigh, allowNegative: false),
36-
errorSmoothingFactor: AppContextConfigHelper.GetInt32Config("HillClimbing_ErrorSmoothingFactor", 1) / 100.0,
37-
gainExponent: AppContextConfigHelper.GetInt32Config("HillClimbing_GainExponent", 200) / 100.0,
38-
maxSampleError: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxSampleErrorPercent", 15) / 100.0
26+
return new HillClimbing(wavePeriod: AppContextConfigHelper.GetInt32Config("HillClimbing_WavePeriod", 4, false),
27+
maxWaveMagnitude: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxWaveMagnitude", 20, false),
28+
waveMagnitudeMultiplier: AppContextConfigHelper.GetInt32Config("HillClimbing_WaveMagnitudeMultiplier", 100, false) / 100.0,
29+
waveHistorySize: AppContextConfigHelper.GetInt32Config("HillClimbing_WaveHistorySize", 8, false),
30+
targetThroughputRatio: AppContextConfigHelper.GetInt32Config("HillClimbing_Bias", 15, false) / 100.0,
31+
targetSignalToNoiseRatio: AppContextConfigHelper.GetInt32Config("HillClimbing_TargetSignalToNoiseRatio", 300, false) / 100.0,
32+
maxChangePerSecond: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxChangePerSecond", 4, false),
33+
maxChangePerSample: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxChangePerSample", 20, false),
34+
sampleIntervalMsLow: AppContextConfigHelper.GetInt32Config("HillClimbing_SampleIntervalLow", DefaultSampleIntervalMsLow, false),
35+
sampleIntervalMsHigh: AppContextConfigHelper.GetInt32Config("HillClimbing_SampleIntervalHigh", DefaultSampleIntervalMsHigh, false),
36+
errorSmoothingFactor: AppContextConfigHelper.GetInt32Config("HillClimbing_ErrorSmoothingFactor", 1, false) / 100.0,
37+
gainExponent: AppContextConfigHelper.GetInt32Config("HillClimbing_GainExponent", 200, false) / 100.0,
38+
maxSampleError: AppContextConfigHelper.GetInt32Config("HillClimbing_MaxSampleErrorPercent", 15, false) / 100.0
3939
);
4040
}
4141
private const int LogCapacity = 200;

src/System.Private.CoreLib/src/System/Threading/ClrThreadPool.WorkerThread.cs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ private static class WorkerThread
2323
private static void WorkerThreadStart()
2424
{
2525
ClrThreadPoolEventSource.Log.WorkerThreadStart(ThreadCounts.VolatileReadCounts(ref ThreadPoolInstance._separated.counts).numExistingThreads);
26-
RuntimeThread currentThread = RuntimeThread.CurrentThread;
26+
2727
while (true)
2828
{
2929
while (WaitForRequest())
@@ -189,7 +189,14 @@ internal static bool ShouldStopProcessingWorkNow()
189189
ThreadCounts counts = ThreadCounts.VolatileReadCounts(ref ThreadPoolInstance._separated.counts);
190190
while (true)
191191
{
192-
if (counts.numExistingThreads <= counts.numThreadsGoal)
192+
/// When there are more threads processing work than the thread count goal, hill climbing must have decided
193+
/// to decrease the number of threads. Stop processing if the counts can be updated. We may have more
194+
/// threads existing than the thread count goal and that is ok, the cold ones will eventually time out if
195+
/// the thread count goal is not increased again. This logic is a bit different from the original CoreCLR
196+
/// code from which this implementation was ported, which turns a processing thread into a retired thread
197+
/// and checks for pending requests like <see cref="RemoveWorkingWorker"/>. In this implementation there are
198+
/// no retired threads, so only the count of threads processing work is considered.
199+
if (counts.numProcessingWork <= counts.numThreadsGoal)
193200
{
194201
return false;
195202
}

src/System.Private.CoreLib/src/System/Threading/ClrThreadPool.cs

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,31 @@ namespace System.Threading
99
/// <summary>
1010
/// A thread-pool run and managed on the CLR.
1111
/// </summary>
12-
internal partial class ClrThreadPool
12+
internal sealed partial class ClrThreadPool
1313
{
1414
#pragma warning disable IDE1006 // Naming Styles
1515
public static readonly ClrThreadPool ThreadPoolInstance = new ClrThreadPool();
1616
#pragma warning restore IDE1006 // Naming Styles
1717

1818
private const int ThreadPoolThreadTimeoutMs = 20 * 1000; // If you change this make sure to change the timeout times in the tests.
19-
19+
20+
#if BIT64
2021
private const short MaxPossibleThreadCount = short.MaxValue;
22+
#elif BIT32
23+
private const short MaxPossibleThreadCount = 1023;
24+
#else
25+
#error Unknown platform
26+
#endif
2127

2228
private const int CpuUtilizationHigh = 95;
2329
private const int CpuUtilizationLow = 80;
2430
private int _cpuUtilization = 0;
2531

32+
private static readonly short s_forcedMinWorkerThreads = AppContextConfigHelper.GetInt16Config("System.Threading.ThreadPool.MinThreads", 0, false);
33+
private static readonly short s_forcedMaxWorkerThreads = AppContextConfigHelper.GetInt16Config("System.Threading.ThreadPool.MaxThreads", 0, false);
2634

27-
private static readonly short s_forcedMinWorkerThreads = AppContextConfigHelper.GetInt16Config("System.Threading.ThreadPool.MinThreads", 0);
28-
private static readonly short s_forcedMaxWorkerThreads = AppContextConfigHelper.GetInt16Config("System.Threading.ThreadPool.MaxThreads", 0);
29-
30-
private short _minThreads = (short)ThreadPoolGlobals.processorCount;
31-
private short _maxThreads = MaxPossibleThreadCount;
35+
private short _minThreads;
36+
private short _maxThreads;
3237
private readonly LowLevelLock _maxMinThreadLock = new LowLevelLock();
3338

3439
[StructLayout(LayoutKind.Explicit, Size = CacheLineSize * 5)]
@@ -62,11 +67,23 @@ private struct CacheLineSeparated
6267

6368
private ClrThreadPool()
6469
{
70+
_minThreads = s_forcedMinWorkerThreads > 0 ? s_forcedMinWorkerThreads : (short)ThreadPoolGlobals.processorCount;
71+
if (_minThreads > MaxPossibleThreadCount)
72+
{
73+
_minThreads = MaxPossibleThreadCount;
74+
}
75+
76+
_maxThreads = s_forcedMaxWorkerThreads > 0 ? s_forcedMaxWorkerThreads : MaxPossibleThreadCount;
77+
if (_maxThreads < _minThreads)
78+
{
79+
_maxThreads = _minThreads;
80+
}
81+
6582
_separated = new CacheLineSeparated
6683
{
6784
counts = new ThreadCounts
6885
{
69-
numThreadsGoal = s_forcedMinWorkerThreads > 0 ? s_forcedMinWorkerThreads : _minThreads
86+
numThreadsGoal = _minThreads
7087
}
7188
};
7289
}
@@ -181,23 +198,16 @@ internal bool NotifyWorkItemComplete()
181198
Interlocked.Increment(ref _completionCount);
182199
Volatile.Write(ref _separated.lastDequeueTime, Environment.TickCount);
183200

184-
if (ShouldAdjustMaxWorkersActive())
201+
if (ShouldAdjustMaxWorkersActive() && _hillClimbingThreadAdjustmentLock.TryAcquire())
185202
{
186-
bool acquiredLock = _hillClimbingThreadAdjustmentLock.TryAcquire();
187203
try
188204
{
189-
if (acquiredLock)
190-
{
191-
AdjustMaxWorkersActive();
192-
}
205+
AdjustMaxWorkersActive();
193206
}
194207
finally
195208
{
196-
if (acquiredLock)
197-
{
198-
_hillClimbingThreadAdjustmentLock.Release();
199-
}
200-
}
209+
_hillClimbingThreadAdjustmentLock.Release();
210+
}
201211
}
202212

203213
return !WorkerThread.ShouldStopProcessingWorkNow();
@@ -272,8 +282,15 @@ private bool ShouldAdjustMaxWorkersActive()
272282
int elapsedInterval = Environment.TickCount - priorTime;
273283
if(elapsedInterval >= requiredInterval)
274284
{
285+
/// Avoid trying to adjust the thread count goal if there are already more threads than the thread count goal.
286+
/// In that situation, hill climbing must have previously decided to decrease the thread count goal, so let's
287+
/// wait until the system responds to that change before calling into hill climbing again. This condition should
288+
/// be the opposite of the condition in <see cref="WorkerThread.ShouldStopProcessingWorkNow"/> that causes
289+
/// threads processing work to stop in response to a decreased thread count goal. The logic here is a bit
290+
/// different from the original CoreCLR code from which this implementation was ported because in this
291+
/// implementation there are no retired threads, so only the count of threads processing work is considered.
275292
ThreadCounts counts = ThreadCounts.VolatileReadCounts(ref _separated.counts);
276-
return counts.numExistingThreads >= counts.numThreadsGoal;
293+
return counts.numProcessingWork <= counts.numThreadsGoal;
277294
}
278295
return false;
279296
}

src/System.Private.CoreLib/src/System/Threading/LowLevelLifoSemaphore.Windows.cs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ internal sealed class LowLevelLifoSemaphore : IDisposable
2222
public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount)
2323
{
2424
Debug.Assert(initialSignalCount >= 0, "Windows LowLevelLifoSemaphore does not support a negative signal count"); // TODO: Track actual signal count to enable this
25-
_completionPort = Interop.Kernel32.CreateIoCompletionPort(new IntPtr(-1), IntPtr.Zero, UIntPtr.Zero, 1);
25+
Debug.Assert(maximumSignalCount > 0);
26+
Debug.Assert(initialSignalCount <= maximumSignalCount);
27+
28+
_completionPort =
29+
Interop.Kernel32.CreateIoCompletionPort(new IntPtr(-1), IntPtr.Zero, UIntPtr.Zero, maximumSignalCount);
2630
if (_completionPort == IntPtr.Zero)
2731
{
2832
var error = Marshal.GetLastWin32Error();
@@ -33,15 +37,27 @@ public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount)
3337
Release(initialSignalCount);
3438
}
3539

40+
~LowLevelLifoSemaphore()
41+
{
42+
if (_completionPort != IntPtr.Zero)
43+
{
44+
Dispose();
45+
}
46+
}
47+
3648
public bool Wait(int timeoutMs)
3749
{
50+
Debug.Assert(timeoutMs >= -1);
51+
3852
bool success = Interop.Kernel32.GetQueuedCompletionStatus(_completionPort, out var numberOfBytes, out var completionKey, out var pointerToOverlapped, timeoutMs);
3953
Debug.Assert(success || (Marshal.GetLastWin32Error() == WaitHandle.WaitTimeout));
4054
return success;
4155
}
4256

4357
public int Release(int count)
4458
{
59+
Debug.Assert(count > 0);
60+
4561
for (int i = 0; i < count; i++)
4662
{
4763
if(!Interop.Kernel32.PostQueuedCompletionStatus(_completionPort, 1, UIntPtr.Zero, IntPtr.Zero))
@@ -57,7 +73,11 @@ public int Release(int count)
5773

5874
public void Dispose()
5975
{
76+
Debug.Assert(_completionPort != IntPtr.Zero);
77+
6078
Interop.Kernel32.CloseHandle(_completionPort);
79+
_completionPort = IntPtr.Zero;
80+
GC.SuppressFinalize(this);
6181
}
6282
}
6383
}

src/System.Private.CoreLib/src/System/Threading/ThreadPool.Portable.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,11 @@ private void SignalUserWaitHandle()
207207
if (handleValue != IntPtr.Zero && handleValue != (IntPtr)(-1))
208208
{
209209
Debug.Assert(handleValue == handle.DangerousGetHandle());
210+
#if PLATFORM_WINDOWS
211+
Interop.Kernel32.SetEvent(handle);
212+
#else
210213
WaitSubsystem.SetEvent(handleValue);
214+
#endif
211215
}
212216
}
213217
finally

tests/src/Simple/Threading/Threading.cs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ public static int Main()
5050
Console.WriteLine(" ThreadPoolTests.ThreadPoolCanPickUpMultipleJobsWhenThreadsAreAvailable");
5151
ThreadPoolTests.ThreadPoolCanPickUpMultipleJobsWhenThreadsAreAvailable();
5252

53+
Console.WriteLine(" ThreadPoolTests.ThreadPoolCanProcessManyWorkItemsInParallelWithoutDeadlocking");
54+
ThreadPoolTests.ThreadPoolCanProcessManyWorkItemsInParallelWithoutDeadlocking();
55+
5356
// This test takes a long time to run (min 42 seconds sleeping). Enable for manual testing.
5457
// Console.WriteLine(" ThreadPoolTests.RunJobsAfterThreadTimeout");
5558
// ThreadPoolTests.RunJobsAfterThreadTimeout();
@@ -950,6 +953,34 @@ void Job(object _)
950953
e0.Set();
951954
}
952955

956+
// See https://github.com/dotnet/corert/issues/6780
957+
[Fact]
958+
public static void ThreadPoolCanProcessManyWorkItemsInParallelWithoutDeadlocking()
959+
{
960+
int iterationCount = 100_000;
961+
var done = new ManualResetEvent(false);
962+
963+
WaitCallback wc = null;
964+
wc = data =>
965+
{
966+
int n = Interlocked.Decrement(ref iterationCount);
967+
if (n == 0)
968+
{
969+
done.Set();
970+
}
971+
else if (n > 0)
972+
{
973+
ThreadPool.QueueUserWorkItem(wc);
974+
}
975+
};
976+
977+
for (int i = 0, n = Environment.ProcessorCount; i < n; ++i)
978+
{
979+
ThreadPool.QueueUserWorkItem(wc);
980+
}
981+
done.WaitOne();
982+
}
983+
953984
private static WaitCallback CreateRecursiveJob(int jobCount, int targetJobCount, AutoResetEvent testJobCompleted)
954985
{
955986
return _ =>

0 commit comments

Comments
 (0)