From cf413a445f77727e2137b2d8f1943bfabf074b2c Mon Sep 17 00:00:00 2001
From: Jakob Bornecrantz
Date: Wed, 30 Mar 2022 18:12:41 +0100
Subject: [PATCH] c/multi: Add per-client thread that waits for sync objects

---
 .../compositor/multi/comp_multi_compositor.c  | 331 +++++++++++++-----
 src/xrt/compositor/multi/comp_multi_private.h |  24 ++
 2 files changed, 266 insertions(+), 89 deletions(-)

diff --git a/src/xrt/compositor/multi/comp_multi_compositor.c b/src/xrt/compositor/multi/comp_multi_compositor.c
index 02e59a939..2d0e59f7b 100644
--- a/src/xrt/compositor/multi/comp_multi_compositor.c
+++ b/src/xrt/compositor/multi/comp_multi_compositor.c
@@ -118,6 +118,218 @@ drain_events(struct multi_compositor *mc)
 }
 
 
+/*
+ *
+ * Wait helper thread.
+ *
+ */
+
+static void
+wait_fence(struct xrt_compositor_fence **xcf_ptr)
+{
+	COMP_TRACE_MARKER();
+	xrt_result_t ret = XRT_SUCCESS;
+
+	// 100ms
+	uint64_t timeout_ns = 100 * U_TIME_1MS_IN_NS;
+
+	do {
+		ret = xrt_compositor_fence_wait(*xcf_ptr, timeout_ns);
+		if (ret != XRT_TIMEOUT) {
+			break;
+		}
+
+		U_LOG_W("Waiting on client fence timed out > 100ms!");
+	} while (true);
+
+	xrt_compositor_fence_destroy(xcf_ptr);
+
+	if (ret != XRT_SUCCESS) {
+		U_LOG_E("Fence waiting failed!");
+	}
+}
+
+static void
+wait_semaphore(struct xrt_compositor_semaphore **xcsem_ptr, uint64_t value)
+{
+	COMP_TRACE_MARKER();
+	xrt_result_t ret = XRT_SUCCESS;
+
+	// 100ms
+	uint64_t timeout_ns = 100 * U_TIME_1MS_IN_NS;
+
+	do {
+		ret = xrt_compositor_semaphore_wait(*xcsem_ptr, value, timeout_ns);
+		if (ret != XRT_TIMEOUT) {
+			break;
+		}
+
+		U_LOG_W("Waiting on client semaphore value '%" PRIu64 "' timed out > 100ms!", value);
+	} while (true);
+
+	xrt_compositor_semaphore_reference(xcsem_ptr, NULL);
+}
+
+static void
+wait_for_scheduled_free(struct multi_compositor *mc)
+{
+	COMP_TRACE_MARKER();
+
+	os_mutex_lock(&mc->slot_lock);
+
+	// Block here if the scheduled slot is not clear.
+	while (mc->scheduled.active) {
+
+		// Replace the scheduled frame if it's in the past.
+		uint64_t now_ns = os_monotonic_get_ns();
+		if (mc->scheduled.display_time_ns < now_ns) {
+			break;
+		}
+
+		os_mutex_unlock(&mc->slot_lock);
+
+		os_nanosleep(U_TIME_1MS_IN_NS);
+
+		os_mutex_lock(&mc->slot_lock);
+	}
+
+	slot_move_and_clear(&mc->scheduled, &mc->progress);
+
+	os_mutex_unlock(&mc->slot_lock);
+}
+
+static void *
+run_func(void *ptr)
+{
+	struct multi_compositor *mc = (struct multi_compositor *)ptr;
+
+	os_thread_helper_lock(&mc->wait_thread.oth);
+
+	while (os_thread_helper_is_running_locked(&mc->wait_thread.oth)) {
+
+		if (mc->wait_thread.xcsem == NULL && mc->wait_thread.xcf == NULL) {
+			os_thread_helper_wait_locked(&mc->wait_thread.oth);
+			// Fall through here on stopping, to clean up any outstanding waits.
+		}
+
+		int64_t frame_id = mc->wait_thread.frame_id;
+		struct xrt_compositor_fence *xcf = mc->wait_thread.xcf;
+		struct xrt_compositor_semaphore *xcsem = mc->wait_thread.xcsem; // No need to ref, a move.
+		uint64_t value = mc->wait_thread.value;
+
+		mc->wait_thread.frame_id = 0;
+		mc->wait_thread.xcf = NULL;
+		mc->wait_thread.xcsem = NULL;
+		mc->wait_thread.value = 0;
+
+		// We are being stopped, loop back and check running.
+		if (xcf == NULL && xcsem == NULL) {
+			continue;
+		}
+
+		// We now know that we should wait.
+		mc->wait_thread.waiting = true;
+
+		os_thread_helper_unlock(&mc->wait_thread.oth);
+
+		if (xcsem != NULL) {
+			wait_semaphore(&xcsem, value);
+		}
+		if (xcf != NULL) {
+			wait_fence(&xcf);
+		}
+
+		// Sample the time outside of the lock.
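+		// The sync object has signaled, so the GPU work for this
+		// frame is done; taking the timestamp before grabbing the
+		// lock keeps the list_and_timing_lock critical section short.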
+		uint64_t now_ns = os_monotonic_get_ns();
+
+		os_mutex_lock(&mc->msc->list_and_timing_lock);
+		u_pa_mark_gpu_done(mc->upa, frame_id, now_ns);
+		os_mutex_unlock(&mc->msc->list_and_timing_lock);
+
+		// Wait for the delivery slot.
+		wait_for_scheduled_free(mc);
+
+		os_thread_helper_lock(&mc->wait_thread.oth);
+
+		// Finally, no longer waiting.
+		//! @todo Move to before wait_for_scheduled_free?
+		mc->wait_thread.waiting = false;
+
+		if (mc->wait_thread.blocked) {
+			os_thread_helper_signal_locked(&mc->wait_thread.oth);
+		}
+	}
+
+	os_thread_helper_unlock(&mc->wait_thread.oth);
+
+	return NULL;
+}
+
+static void
+wait_for_wait_thread_locked(struct multi_compositor *mc)
+{
+	// Should we wait for the last frame?
+	if (mc->wait_thread.waiting) {
+		COMP_TRACE_IDENT(blocked);
+
+		mc->wait_thread.blocked = true;
+		os_thread_helper_wait_locked(&mc->wait_thread.oth);
+		mc->wait_thread.blocked = false;
+	}
+}
+
+static void
+wait_for_wait_thread(struct multi_compositor *mc)
+{
+	os_thread_helper_lock(&mc->wait_thread.oth);
+
+	wait_for_wait_thread_locked(mc);
+
+	os_thread_helper_unlock(&mc->wait_thread.oth);
+}
+
+static void
+push_fence_to_wait_thread(struct multi_compositor *mc, int64_t frame_id, struct xrt_compositor_fence *xcf)
+{
+	os_thread_helper_lock(&mc->wait_thread.oth);
+
+	// The function layer_begin should already have waited, but just in case.
+	assert(!mc->wait_thread.waiting);
+	wait_for_wait_thread_locked(mc);
+
+	assert(mc->wait_thread.xcf == NULL);
+
+	mc->wait_thread.frame_id = frame_id;
+	mc->wait_thread.xcf = xcf;
+
+	os_thread_helper_signal_locked(&mc->wait_thread.oth);
+
+	os_thread_helper_unlock(&mc->wait_thread.oth);
+}
+
+static void
+push_semaphore_to_wait_thread(struct multi_compositor *mc,
+                              int64_t frame_id,
+                              struct xrt_compositor_semaphore *xcsem,
+                              uint64_t value)
+{
+	os_thread_helper_lock(&mc->wait_thread.oth);
+
+	// The function layer_begin should already have waited, but just in case.
+	assert(!mc->wait_thread.waiting);
+	wait_for_wait_thread_locked(mc);
+
+	assert(mc->wait_thread.xcsem == NULL);
+
+	mc->wait_thread.frame_id = frame_id;
+	xrt_compositor_semaphore_reference(&mc->wait_thread.xcsem, xcsem);
+	mc->wait_thread.value = value;
+
+	os_thread_helper_signal_locked(&mc->wait_thread.oth);
+
+	os_thread_helper_unlock(&mc->wait_thread.oth);
+}
+
+
 /*
  *
  * Compositor functions.
  *
  */
@@ -330,6 +542,18 @@ multi_compositor_layer_begin(struct xrt_compositor *xc,
 	u_pa_mark_delivered(mc->upa, frame_id, now_ns, display_time_ns);
 	os_mutex_unlock(&mc->msc->list_and_timing_lock);
 
+	/*
+	 * We have to block here until the wait thread has pushed the last
+	 * submitted frame from the progress slot to the scheduled slot,
+	 * which it only does after the sync object has signaled completion.
+	 *
+	 * If the previous frame's GPU work has not completed we will block
+	 * here, but that is okay as the app has already submitted the GPU
+	 * work for this frame. This should have very little impact on GPU
+	 * utilisation, if any.
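+	 *
+	 * As an example, assuming a typical frame loop: frame N's
+	 * layer_commit hands its sync object to the wait thread and returns
+	 * immediately, and it is frame N + 1's layer_begin that blocks here
+	 * until frame N's GPU work has signaled and the hand-off has happened.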
+	 */
+	wait_for_wait_thread(mc);
+
 	assert(mc->progress.layer_count == 0);
 	U_ZERO(&mc->progress);
 
@@ -461,59 +685,6 @@ multi_compositor_layer_equirect2(struct xrt_compositor *xc,
 	return XRT_SUCCESS;
 }
 
-static void
-wait_fence(struct xrt_compositor_fence **xcf_ptr)
-{
-	COMP_TRACE_MARKER();
-	xrt_result_t ret = XRT_SUCCESS;
-
-	// 100ms
-	uint64_t timeout_ns = 100 * U_TIME_1MS_IN_NS;
-
-	do {
-		ret = xrt_compositor_fence_wait(*xcf_ptr, timeout_ns);
-		if (ret != XRT_TIMEOUT) {
-			break;
-		}
-
-		U_LOG_W("Waiting on client fence timed out > 100ms!");
-	} while (true);
-
-	xrt_compositor_fence_destroy(xcf_ptr);
-
-	if (ret != XRT_SUCCESS) {
-		U_LOG_E("Fence waiting failed!");
-	}
-}
-
-static void
-wait_for_scheduled_free(struct multi_compositor *mc)
-{
-	COMP_TRACE_MARKER();
-
-	os_mutex_lock(&mc->slot_lock);
-
-	// Block here if the scheduled slot is not clear.
-	while (mc->scheduled.active) {
-
-		// Replace the scheduled frame if it's in the past.
-		uint64_t now_ns = os_monotonic_get_ns();
-		if (mc->scheduled.display_time_ns < now_ns) {
-			break;
-		}
-
-		os_mutex_unlock(&mc->slot_lock);
-
-		os_nanosleep(U_TIME_1MS_IN_NS);
-
-		os_mutex_lock(&mc->slot_lock);
-	}
-
-	slot_move_and_clear(&mc->scheduled, &mc->progress);
-
-	os_mutex_unlock(&mc->slot_lock);
-}
-
 static xrt_result_t
 multi_compositor_layer_commit(struct xrt_compositor *xc, int64_t frame_id, xrt_graphics_sync_handle_t sync_handle)
 {
@@ -544,38 +715,21 @@ multi_compositor_layer_commit(struct xrt_compositor *xc, int64_t frame_id, xrt_g
 	} while (false); // Goto without the labels.
 
 	if (xcf != NULL) {
-		wait_fence(&xcf);
+		push_fence_to_wait_thread(mc, frame_id, xcf);
+	} else {
+		// Assume that the app-side compositor waited.
+		uint64_t now_ns = os_monotonic_get_ns();
+
+		os_mutex_lock(&mc->msc->list_and_timing_lock);
+		u_pa_mark_gpu_done(mc->upa, frame_id, now_ns);
+		os_mutex_unlock(&mc->msc->list_and_timing_lock);
+
+		wait_for_scheduled_free(mc);
 	}
 
-	wait_for_scheduled_free(mc);
-
-	uint64_t now_ns = os_monotonic_get_ns();
-
-	os_mutex_lock(&mc->msc->list_and_timing_lock);
-	u_pa_mark_gpu_done(mc->upa, frame_id, now_ns);
-	os_mutex_unlock(&mc->msc->list_and_timing_lock);
-
 	return XRT_SUCCESS;
 }
 
-static void
-wait_semaphore(struct xrt_compositor_semaphore *xcsem, uint64_t value)
-{
-	COMP_TRACE_MARKER();
-	xrt_result_t ret = XRT_SUCCESS;
-
-	// 100ms
-	uint64_t timeout_ns = 100 * U_TIME_1MS_IN_NS;
-
-	do {
-		ret = xrt_compositor_semaphore_wait(xcsem, value, timeout_ns);
-		if (ret != XRT_TIMEOUT) {
-			break;
-		}
-
-		U_LOG_W("Waiting on client semaphore value '%" PRIu64 "' timed out > 100ms!", value);
-	} while (true);
-}
-
 static xrt_result_t
 multi_compositor_layer_commit_with_semaphore(struct xrt_compositor *xc,
                                              int64_t frame_id,
@@ -586,15 +740,7 @@ multi_compositor_layer_commit_with_semaphore(struct xrt_compositor *xc,
 
 	struct multi_compositor *mc = multi_compositor(xc);
 
-	// Wait for the semaphore.
-	wait_semaphore(xcsem, value);
-
-	wait_for_scheduled_free(mc);
-
-	uint64_t now_ns = os_monotonic_get_ns();
-
-	os_mutex_lock(&mc->msc->list_and_timing_lock);
-	u_pa_mark_gpu_done(mc->upa, frame_id, now_ns);
-	os_mutex_unlock(&mc->msc->list_and_timing_lock);
+	push_semaphore_to_wait_thread(mc, frame_id, xcsem, value);
 
 	return XRT_SUCCESS;
 }
@@ -631,6 +777,9 @@ multi_compositor_destroy(struct xrt_compositor *xc)
 
 	drain_events(mc);
 
+	// Stop the wait thread.
+	os_thread_helper_stop(&mc->wait_thread.oth);
+
 	// We are now off the rendering list, clear slots for any swapchains.
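+	// (Clearing also drops any swapchain references the slots still hold.)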
 	slot_clear(&mc->progress);
 	slot_clear(&mc->scheduled);
 
@@ -701,6 +850,7 @@ multi_compositor_create(struct multi_system_compositor *msc,
 	os_mutex_init(&mc->event.mutex);
 	os_mutex_init(&mc->slot_lock);
+	os_thread_helper_init(&mc->wait_thread.oth);
 
 	// Passthrough our formats from the native compositor to the client.
 	mc->base.base.info = msc->xcn->base.info;
@@ -730,6 +880,9 @@ multi_compositor_create(struct multi_system_compositor *msc,
 
 	os_mutex_unlock(&msc->list_and_timing_lock);
 
+	// Lastly, start the wait thread.
+	os_thread_helper_start(&mc->wait_thread.oth, run_func, mc);
+
 	*out_xcn = &mc->base;
 
 	return XRT_SUCCESS;
diff --git a/src/xrt/compositor/multi/comp_multi_private.h b/src/xrt/compositor/multi/comp_multi_private.h
index 03d061cd9..a67522bf5 100644
--- a/src/xrt/compositor/multi/comp_multi_private.h
+++ b/src/xrt/compositor/multi/comp_multi_private.h
@@ -126,6 +126,30 @@ struct multi_compositor
 		int64_t z_order;
 	} state;
 
+	struct
+	{
+		//! Fence to wait for.
+		struct xrt_compositor_fence *xcf;
+
+		//! Timeline semaphore to wait for.
+		struct xrt_compositor_semaphore *xcsem;
+
+		//! Timeline semaphore value to wait for.
+		uint64_t value;
+
+		//! Frame ID of the frame being waited on.
+		int64_t frame_id;
+
+		//! The wait thread itself.
+		struct os_thread_helper oth;
+
+		//! Is the wait thread waiting? If so, the client should block.
+		bool waiting;
+
+		//! Is the client thread blocked? If so, it should be woken up.
+		bool blocked;
+	} wait_thread;
+
 	//! Lock for all of the slots.
 	struct os_mutex slot_lock;