Skip to content

Commit 67c5a87

Browse files
committed
[threads] Use refcounts for coordinating finalization and detaching
Reverts a29ad08 (mono#9914). The basic problem we want to solve is the following: 1. All access to InternalThread:state must be protected by the InternalThread:synch_cs mutex. 2. We must destroy the mutex when we are done with the thread. 3. We don't know which happens later - detaching the machine thread or finalizing its InternalThread managed object. The solution is to replace InternalThread:synch_cs by InternalThread:longlived which is a refcounted struct that holds the synch_cs. The refcount starts out at 2 when the thread is attached to the runtime and when we create the managed InternalThread object that represents it. Both detaching and finalizing the managed object will decrement the refcount, and whichever one happens last will be responsible for destroying the mutex. This addresses mono#11956 which was a race condition due to the previous attempt to fix this lifetime problem. The previous attempt incorrectly used CAS in mono_thread_detach_internal while continuing to use locking of synch_cs elsewhere. In particular mono_thread_suspend_all_other_threads could race with mono_thread_detach_internal: it expects to take the thread lock and test thread->state and use the thread->suspended event, while detaching deletes thread->suspended without taking a lock. As a result we had a concurrency bug: in suspend_all_other_threads it's possible to see both the old (non-Stopped) value of thread->state and the new (NULL) value of thread->suspended, which leads to crashes. --- Background - why we don't know if detaching or finalization happens first. 1. InternalThread normally outlives the machine thread. This can happen because when one thread starts another it can hold a reference to the fresh thread's Thread object which holds a reference to the InternalThread. So after the machine thread is done, the older thread can query the state of the younger Thread object. This is the normal situation. 2.
During shutdown we can have the opposite situation: the InternalThread objects are finalized first (this happens during root domain finalization), but the machine threads are still running, and they may still return to start_wrapper_internal and call detach_internal. So in this case we have an InternalThread whose finalizer ran first and detach will run second.
1 parent 00437dc commit 67c5a87

File tree

4 files changed

+63
-50
lines changed

4 files changed

+63
-50
lines changed

mcs/class/corlib/System.Threading/Thread.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ keep as an object to avoid triggering its class constructor when not needed */
7171
internal int _serialized_principal_version;
7272
private IntPtr appdomain_refs;
7373
private int interruption_requested;
74-
private IntPtr synch_cs;
74+
private IntPtr longlived;
7575
internal bool threadpool_thread;
7676
private bool thread_interrupt_requested;
7777
/* These are used from managed code */

mono/metadata/object-internals.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ struct _MonoInternalThread {
510510
gpointer unused3;
511511
gunichar2 *name;
512512
guint32 name_len;
513-
guint32 state;
513+
guint32 state; /* must be accessed while longlived->synch_cs is locked */
514514
MonoException *abort_exc;
515515
int abort_state_handle;
516516
guint64 tid; /* This is accessed as a gsize in the code (so it can hold a 64bit pointer on systems that need it), but needs to reserve 64 bits of space on all machines as it corresponds to a field in managed code */
@@ -524,7 +524,10 @@ struct _MonoInternalThread {
524524
gpointer appdomain_refs;
525525
/* This is modified using atomic ops, so keep it a gint32 */
526526
gint32 __interruption_requested;
527-
MonoCoopMutex *synch_cs;
527+
/* data that must live as long as this managed object is not finalized
528+
* or as long as the underlying thread is attached, whichever is
529+
* longer */
530+
MonoLongLivedThreadData *longlived;
528531
MonoBoolean threadpool_thread;
529532
MonoBoolean thread_interrupt_requested;
530533
int stack_size;

mono/metadata/threads-types.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,20 @@ mono_thread_create_internal_handle (MonoDomain *domain, T func, gpointer arg, Mo
106106
}
107107
#endif
108108

109+
/* Data owned by a MonoInternalThread that must live until both the finalizer
110+
* for MonoInternalThread has run, and the underlying machine thread has
111+
* detached.
112+
*
113+
* Normally a thread is first detached and then the InternalThread object is
114+
* finalized and collected. However during shutdown, when the root domain is
115+
* finalized, all the InternalThread objects are finalized first and the
116+
* machine threads are detached later.
117+
*/
118+
typedef struct {
119+
MonoRefCount ref;
120+
MonoCoopMutex *synch_cs;
121+
} MonoLongLivedThreadData;
122+
109123
void mono_threads_install_cleanup (MonoThreadCleanupFunc func);
110124

111125
ICALL_EXPORT

mono/metadata/threads.c

Lines changed: 43 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,6 @@ static GHashTable *contexts = NULL;
154154
/* Cleanup queue for contexts. */
155155
static MonoReferenceQueue *context_queue;
156156

157-
/* Cleanup queue for threads. */
158-
static MonoReferenceQueue *thread_queue;
159-
160157
/*
161158
* Threads which are starting up and they are not in the 'threads' hash yet.
162159
* When mono_thread_attach_internal is called for a thread, it will be removed from this hash table.
@@ -472,53 +469,56 @@ thread_get_tid (MonoInternalThread *thread)
472469
}
473470

474471
static void
475-
free_synch_cs (void *user_data)
472+
free_synch_cs (MonoCoopMutex *synch_cs)
476473
{
477-
MonoCoopMutex *synch_cs = (MonoCoopMutex*)user_data;
478474
g_assert (synch_cs);
479475
mono_coop_mutex_destroy (synch_cs);
480476
g_free (synch_cs);
481477
}
482478

483479
static void
484-
ensure_synch_cs_set (MonoInternalThread *thread)
480+
free_longlived_thread_data (void *user_data)
485481
{
486-
MonoCoopMutex *synch_cs;
482+
MonoLongLivedThreadData *lltd = (MonoLongLivedThreadData*)user_data;
483+
free_synch_cs (lltd->synch_cs);
487484

488-
if (thread->synch_cs != NULL) {
489-
return;
490-
}
485+
g_free (lltd);
486+
}
487+
488+
static void
489+
init_longlived_thread_data (MonoLongLivedThreadData *lltd)
490+
{
491+
mono_refcount_init (lltd, free_longlived_thread_data);
492+
mono_refcount_inc (lltd);
493+
/* Initial refcount is 2: decremented once by
494+
* mono_thread_detach_internal and once by the MonoInternalThread
495+
* finalizer - whichever one happens later will deallocate. */
491496

492-
synch_cs = g_new0 (MonoCoopMutex, 1);
493-
mono_coop_mutex_init_recursive (synch_cs);
497+
lltd->synch_cs = g_new0 (MonoCoopMutex, 1);
498+
mono_coop_mutex_init_recursive (lltd->synch_cs);
494499

495-
if (mono_atomic_cas_ptr ((gpointer *)&thread->synch_cs,
496-
synch_cs, NULL) != NULL) {
497-
/* Another thread must have installed this CS */
498-
mono_coop_mutex_destroy (synch_cs);
499-
g_free (synch_cs);
500-
} else {
501-
// If we were the ones to initialize with this synch_cs variable, we
502-
// should associate this one with our cleanup
503-
mono_gc_reference_queue_add_internal (thread_queue, &thread->obj, synch_cs);
504-
}
500+
mono_memory_barrier ();
501+
}
502+
503+
static void
504+
dec_longlived_thread_data (MonoLongLivedThreadData *lltd)
505+
{
506+
mono_refcount_dec (lltd);
505507
}
506508

507509
static inline void
508510
lock_thread (MonoInternalThread *thread)
509511
{
510-
if (!thread->synch_cs)
511-
ensure_synch_cs_set (thread);
512-
513-
g_assert (thread->synch_cs);
512+
g_assert (thread->longlived);
513+
g_assert (thread->longlived->synch_cs);
514514

515-
mono_coop_mutex_lock (thread->synch_cs);
515+
mono_coop_mutex_lock (thread->longlived->synch_cs);
516516
}
517517

518518
static inline void
519519
unlock_thread (MonoInternalThread *thread)
520520
{
521-
mono_coop_mutex_unlock (thread->synch_cs);
521+
mono_coop_mutex_unlock (thread->longlived->synch_cs);
522522
}
523523

524524
static void
@@ -673,7 +673,8 @@ create_internal_thread_object (void)
673673
/* only possible failure mode is OOM, from which we don't exect to recover */
674674
mono_error_assert_ok (error);
675675

676-
ensure_synch_cs_set (thread);
676+
thread->longlived = g_new0 (MonoLongLivedThreadData, 1);
677+
init_longlived_thread_data (thread->longlived);
677678

678679
thread->apartment_state = ThreadApartmentState_Unknown;
679680
thread->managed_id = get_next_managed_thread_id ();
@@ -942,20 +943,12 @@ mono_thread_detach_internal (MonoInternalThread *thread)
942943
thread->abort_exc = NULL;
943944
thread->current_appcontext = NULL;
944945

945-
/*
946-
* This should be alive until after the reference queue runs the
947-
* post-free cleanup function
948-
*/
949-
while (TRUE) {
950-
guint32 old_state = thread->state;
946+
LOCK_THREAD (thread);
951947

952-
guint32 new_state = old_state;
953-
new_state |= ThreadState_Stopped;
954-
new_state &= ~ThreadState_Background;
948+
thread->state |= ThreadState_Stopped;
949+
thread->state &= ~ThreadState_Background;
955950

956-
if (mono_atomic_cas_i32 ((gint32 *)&thread->state, new_state, old_state) == old_state)
957-
break;
958-
}
951+
UNLOCK_THREAD (thread);
959952

960953
/*
961954
An interruption request has leaked to cleanup. Adjust the global counter.
@@ -1049,6 +1042,10 @@ mono_thread_detach_internal (MonoInternalThread *thread)
10491042

10501043
mono_thread_info_unset_internal_thread_gchandle (info);
10511044

1045+
/* Possibly free synch_cs, if the finalizer for InternalThread already
1046+
* ran also. */
1047+
dec_longlived_thread_data (thread->longlived);
1048+
10521049
MONO_PROFILER_RAISE (thread_exited, (thread->tid));
10531050

10541051
/* Don't need to close the handle to this thread, even though we took a
@@ -1666,9 +1663,9 @@ ves_icall_System_Threading_InternalThread_Thread_free_internal (MonoInternalThre
16661663
CloseHandle (this_obj->native_handle);
16671664
#endif
16681665

1669-
// Taken care of by reference queue, but we should
1670-
// zero it out
1671-
this_obj->synch_cs = NULL;
1666+
/* Possibly free synch_cs, if the thread already detached also. */
1667+
dec_longlived_thread_data (this_obj->longlived);
1668+
16721669

16731670
if (this_obj->name) {
16741671
void *name = this_obj->name;
@@ -3253,7 +3250,6 @@ void mono_thread_init (MonoThreadStartCB start_cb,
32533250
mono_thread_start_cb = start_cb;
32543251
mono_thread_attach_cb = attach_cb;
32553252

3256-
thread_queue = mono_gc_reference_queue_new_internal (free_synch_cs);
32573253
}
32583254

32593255
static gpointer
@@ -5497,7 +5493,7 @@ async_suspend_critical (MonoThreadInfo *info, gpointer ud)
54975493
}
54985494
}
54995495

5500-
/* LOCKING: called with @thread synch_cs held, and releases it */
5496+
/* LOCKING: called with @thread longlived->synch_cs held, and releases it */
55015497
static void
55025498
async_suspend_internal (MonoInternalThread *thread, gboolean interrupt)
55035499
{
@@ -5520,7 +5516,7 @@ async_suspend_internal (MonoInternalThread *thread, gboolean interrupt)
55205516
UNLOCK_THREAD (thread);
55215517
}
55225518

5523-
/* LOCKING: called with @thread synch_cs held, and releases it */
5519+
/* LOCKING: called with @thread longlived->synch_cs held, and releases it */
55245520
static void
55255521
self_suspend_internal (void)
55265522
{

0 commit comments

Comments
 (0)