[threads] Use refcounts for coordinating finalization and detaching

lambdageek · lambdageek · commit 67c5a87acebd · 2019-01-14T19:29:09.000-05:00
Reverts a29ad08 (mono#9914). The basic problem we want to solve is the following: 1. All access to InternalThread:state must be protected by the InternalThread:synch_cs mutex. 2. We must destroy the mutex when we are done with the thread. 3. We don't know which happens later - detaching the machine thread or finalizing its InternalThread managed object. The solution is to replace InternalThread:synch_cs by InternalThread:longlived which is a refcounted struct that holds the synch_cs. The refcount starts out at 2 when the thread is attached to the runtime and when we create the managed InternalThread object that represents it. Both detaching and finalizing the managed object will decrement the refcount, and whichever one happens last will be responsible for destroying the mutex. This addresses mono#11956 which was a race condition due to the previous attempt to fix this lifetime problem. The previous attempt incorrectly used CAS in mono_thread_detach_internal while continuing to use locking of synch_cs elsewhere. In particular mono_thread_suspend_all_other_threads could race with mono_thread_detach_internal: it expects to take the thread lock and test thread->state and use the thread->suspended event, while detaching deletes thread->suspended without taking a lock. As a result we had a concurrency bug: in suspend_all_other_threads it's possible to see both the old (non-Stopped) value of thread->state and the new (NULL) value of thread->suspended, which leads to crashes. --- Background - why we don't know if detaching or finalization happens first. 1. InternalThread normally outlives the machine thread. This can happen because when one thread starts another it can hold a reference to the fresh thread's Thread object which holds a reference to the InternalThread. So after the machine thread is done, the older thread can query the state of the younger Thread object. This is the normal situation. 2. 
During shutdown we can have the opposite situation: the InternalThread objects are finalized first (this happens during root domain finalization), but the machine threads are still running, and they may still return to start_wrapper_internal and call detach_internal. So in this case we have an InternalThread whose finalizer ran first and detach will run second.
diff --git a/mcs/class/corlib/System.Threading/Thread.cs b/mcs/class/corlib/System.Threading/Thread.cs
@@ -71,7 +71,7 @@ keep as an object to avoid triggering its class constructor when not needed */
 		internal int _serialized_principal_version;
 		private IntPtr appdomain_refs;
 		private int interruption_requested;
-		private IntPtr synch_cs;
+		private IntPtr longlived;
 		internal bool threadpool_thread;
 		private bool thread_interrupt_requested;
 		/* These are used from managed code */
diff --git a/mono/metadata/object-internals.h b/mono/metadata/object-internals.h
@@ -510,7 +510,7 @@ struct _MonoInternalThread {
 	gpointer unused3;
 	gunichar2  *name;
 	guint32	    name_len;
-	guint32	    state;
+	guint32	    state;      /* must be accessed while longlived->synch_cs is locked */
 	MonoException *abort_exc;
 	int abort_state_handle;
 	guint64 tid;	/* This is accessed as a gsize in the code (so it can hold a 64bit pointer on systems that need it), but needs to reserve 64 bits of space on all machines as it corresponds to a field in managed code */
@@ -524,7 +524,10 @@ struct _MonoInternalThread {
 	gpointer appdomain_refs;
 	/* This is modified using atomic ops, so keep it a gint32 */
 	gint32 __interruption_requested;
-	MonoCoopMutex *synch_cs;
+	/* data that must live as long as this managed object is not finalized
+	 * or as long as the underlying thread is attached, whichever is
+	 * longer */
+	MonoLongLivedThreadData *longlived;
 	MonoBoolean threadpool_thread;
 	MonoBoolean thread_interrupt_requested;
 	int stack_size;
diff --git a/mono/metadata/threads-types.h b/mono/metadata/threads-types.h
@@ -106,6 +106,20 @@ mono_thread_create_internal_handle (MonoDomain *domain, T func, gpointer arg, Mo
 }
 #endif
 
+/* Data owned by a MonoInternalThread that must live until both the finalizer
+ * for MonoInternalThread has run, and the underlying machine thread has
+ * detached.
+ *
+ * Normally a thread is first detached and then the InternalThread object is
+ * finalized and collected.  However during shutdown, when the root domain is
+ * finalized, all the InternalThread objects are finalized first and the
+ * machine threads are detached later.
+ */
+typedef struct {
+  MonoRefCount ref;
+  MonoCoopMutex *synch_cs;
+} MonoLongLivedThreadData;
+
 void mono_threads_install_cleanup (MonoThreadCleanupFunc func);
 
 ICALL_EXPORT
diff --git a/mono/metadata/threads.c b/mono/metadata/threads.c
@@ -154,9 +154,6 @@ static GHashTable *contexts = NULL;
 /* Cleanup queue for contexts. */
 static MonoReferenceQueue *context_queue;
 
-/* Cleanup queue for threads. */
-static MonoReferenceQueue *thread_queue;
-
 /*
  * Threads which are starting up and they are not in the 'threads' hash yet.
  * When mono_thread_attach_internal is called for a thread, it will be removed from this hash table.
@@ -472,53 +469,56 @@ thread_get_tid (MonoInternalThread *thread)
 }
 
 static void
-free_synch_cs (void *user_data)
+free_synch_cs (MonoCoopMutex *synch_cs)
 {
-	MonoCoopMutex *synch_cs = (MonoCoopMutex*)user_data;
 	g_assert (synch_cs);
 	mono_coop_mutex_destroy (synch_cs);
 	g_free (synch_cs);
 }
 
 static void
-ensure_synch_cs_set (MonoInternalThread *thread)
+free_longlived_thread_data (void *user_data)
 {
-	MonoCoopMutex *synch_cs;
+	MonoLongLivedThreadData *lltd = (MonoLongLivedThreadData*)user_data;
+	free_synch_cs (lltd->synch_cs);
 
-	if (thread->synch_cs != NULL) {
-		return;
-	}
+	g_free (lltd);
+}
+
+static void
+init_longlived_thread_data (MonoLongLivedThreadData *lltd)
+{
+	mono_refcount_init (lltd, free_longlived_thread_data);
+	mono_refcount_inc (lltd);
+	/* Initial refcount is 2: decremented once by
+	 * mono_thread_detach_internal and once by the MonoInternalThread
+	 * finalizer - whichever one happens later will deallocate. */
 
-	synch_cs = g_new0 (MonoCoopMutex, 1);
-	mono_coop_mutex_init_recursive (synch_cs);
+	lltd->synch_cs = g_new0 (MonoCoopMutex, 1);
+	mono_coop_mutex_init_recursive (lltd->synch_cs);
 
-	if (mono_atomic_cas_ptr ((gpointer *)&thread->synch_cs,
-					       synch_cs, NULL) != NULL) {
-		/* Another thread must have installed this CS */
-		mono_coop_mutex_destroy (synch_cs);
-		g_free (synch_cs);
-	} else {
-		// If we were the ones to initialize with this synch_cs variable, we
-		// should associate this one with our cleanup
-		mono_gc_reference_queue_add_internal (thread_queue, &thread->obj, synch_cs);
-	}
+	mono_memory_barrier ();
+}
+
+static void
+dec_longlived_thread_data (MonoLongLivedThreadData *lltd)
+{
+	mono_refcount_dec (lltd);
 }
 
 static inline void
 lock_thread (MonoInternalThread *thread)
 {
-	if (!thread->synch_cs)
-		ensure_synch_cs_set (thread);
-
-	g_assert (thread->synch_cs);
+	g_assert (thread->longlived);
+	g_assert (thread->longlived->synch_cs);
 
-	mono_coop_mutex_lock (thread->synch_cs);
+	mono_coop_mutex_lock (thread->longlived->synch_cs);
 }
 
 static inline void
 unlock_thread (MonoInternalThread *thread)
 {
-	mono_coop_mutex_unlock (thread->synch_cs);
+	mono_coop_mutex_unlock (thread->longlived->synch_cs);
 }
 
 static void
@@ -673,7 +673,8 @@ create_internal_thread_object (void)
 	/* only possible failure mode is OOM, from which we don't exect to recover */
 	mono_error_assert_ok (error);
 
-	ensure_synch_cs_set (thread);
+	thread->longlived = g_new0 (MonoLongLivedThreadData, 1);
+	init_longlived_thread_data (thread->longlived);
 
 	thread->apartment_state = ThreadApartmentState_Unknown;
 	thread->managed_id = get_next_managed_thread_id ();
@@ -942,20 +943,12 @@ mono_thread_detach_internal (MonoInternalThread *thread)
 	thread->abort_exc = NULL;
 	thread->current_appcontext = NULL;
 
-	/*
-	 * This should be alive until after the reference queue runs the
-	 * post-free cleanup function
-	 */
-	while (TRUE) {
-		guint32 old_state = thread->state;
+	LOCK_THREAD (thread);
 
-		guint32 new_state = old_state;
-		new_state |= ThreadState_Stopped;
-		new_state &= ~ThreadState_Background;
+	thread->state |= ThreadState_Stopped;
+	thread->state &= ~ThreadState_Background;
 
-		if (mono_atomic_cas_i32 ((gint32 *)&thread->state, new_state, old_state) == old_state)
-			break;
-	}
+	UNLOCK_THREAD (thread);
 
 	/*
 	An interruption request has leaked to cleanup. Adjust the global counter.
@@ -1049,6 +1042,10 @@ mono_thread_detach_internal (MonoInternalThread *thread)
 
 	mono_thread_info_unset_internal_thread_gchandle (info);
 
+	/* Drop the detach-side reference; this frees synch_cs if the
+	 * finalizer for InternalThread has already run. */
+	dec_longlived_thread_data (thread->longlived);
+
 	MONO_PROFILER_RAISE (thread_exited, (thread->tid));
 
 	/* Don't need to close the handle to this thread, even though we took a
@@ -1666,9 +1663,9 @@ ves_icall_System_Threading_InternalThread_Thread_free_internal (MonoInternalThre
 	CloseHandle (this_obj->native_handle);
 #endif
 
-	// Taken care of by reference queue, but we should
-	// zero it out
-	this_obj->synch_cs = NULL;
+	/* Drop the finalizer-side reference; frees synch_cs if the thread has already detached. */
+	dec_longlived_thread_data (this_obj->longlived);
+
 
 	if (this_obj->name) {
 		void *name = this_obj->name;
@@ -3253,7 +3250,6 @@ void mono_thread_init (MonoThreadStartCB start_cb,
 	mono_thread_start_cb = start_cb;
 	mono_thread_attach_cb = attach_cb;
 
-	thread_queue = mono_gc_reference_queue_new_internal (free_synch_cs);
 }
 
 static gpointer
@@ -5497,7 +5493,7 @@ async_suspend_critical (MonoThreadInfo *info, gpointer ud)
 	}
 }
 
-/* LOCKING: called with @thread synch_cs held, and releases it */
+/* LOCKING: called with @thread longlived->synch_cs held, and releases it */
 static void
 async_suspend_internal (MonoInternalThread *thread, gboolean interrupt)
 {
@@ -5520,7 +5516,7 @@ async_suspend_internal (MonoInternalThread *thread, gboolean interrupt)
 	UNLOCK_THREAD (thread);
 }
 
-/* LOCKING: called with @thread synch_cs held, and releases it */
+/* LOCKING: called with @thread longlived->synch_cs held, and releases it */
 static void
 self_suspend_internal (void)
 {