monado/src/xrt/auxiliary/util/u_pacing_app.c

// Copyright 2020-2022, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
 * @file
 * @brief  Shared frame timing code.
 * @author Jakob Bornecrantz <jakob@collabora.com>
 * @ingroup aux_util
 */

#include "os/os_time.h"

#include "util/u_time.h"
#include "util/u_misc.h"
#include "util/u_debug.h"
#include "util/u_pacing.h"
#include "util/u_metrics.h"
#include "util/u_logging.h"
#include "util/u_trace_marker.h"

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>

DEBUG_GET_ONCE_LOG_OPTION(log_level, "U_PACING_APP_LOG", U_LOGGING_WARN)

#define UPA_LOG_T(...) U_LOG_IFL_T(debug_get_log_option_log_level(), __VA_ARGS__)
#define UPA_LOG_D(...) U_LOG_IFL_D(debug_get_log_option_log_level(), __VA_ARGS__)
#define UPA_LOG_I(...) U_LOG_IFL_I(debug_get_log_option_log_level(), __VA_ARGS__)
#define UPA_LOG_W(...) U_LOG_IFL_W(debug_get_log_option_log_level(), __VA_ARGS__)
#define UPA_LOG_E(...) U_LOG_IFL_E(debug_get_log_option_log_level(), __VA_ARGS__)

/*!
 * Define to validate latched and retired call. Currently disabled due to
 * simplistic frame allocation code, enable once improved.
 */
#undef VALIDATE_LATCHED_AND_RETIRED


/*
 *
 * Structs enums, and defines.
 *
 */

/*!
 * This controls how many frames are in the allocation array.
 *
 * @todo The allocation code is not good, this is a work around for index reuse
 *       causing asserts, change the code so we don't need it at all.
 */
#define FRAME_COUNT (128)

enum u_pa_state
{
	U_PA_READY,
	U_RT_WAIT_LEFT,
	U_RT_PREDICTED,
	U_RT_BEGUN,
	U_RT_DELIVERED,
	U_RT_GPU_DONE,
};

struct u_pa_frame
{
	int64_t frame_id;

	//! How long we thought the frame would take.
	uint64_t predicted_frame_time_ns;

	//! When we predicted the app should wake up.
	uint64_t predicted_wake_up_time_ns;

	//! When the client's GPU work should have completed.
	uint64_t predicted_gpu_done_time_ns;

	//! When we predicted this frame to be shown.
	uint64_t predicted_display_time_ns;

	//! The selected display period.
	uint64_t predicted_display_period_ns;

	/*!
	 * When the app told us to display this frame, can be different
	 * then the predicted display time so we track that separately.
	 */
	uint64_t display_time_ns;

	//! When something happened.
	struct
	{
		uint64_t predicted_ns;
		uint64_t wait_woke_ns;
		uint64_t begin_ns;
		uint64_t delivered_ns;
		uint64_t gpu_done_ns;
	} when;

	enum u_pa_state state;
};

struct pacing_app
{
	struct u_pacing_app base;

	//! Id for this session.
	int64_t session_id;

	struct u_pa_frame frames[FRAME_COUNT];
	uint32_t current_frame;
	uint32_t next_frame;

	int64_t frame_counter;

	struct
	{
		//! App time between wait returning and begin being called.
		uint64_t cpu_time_ns;
		//! Time between begin and frame data being delivered.
		uint64_t draw_time_ns;
		//! Time between the frame data being delivered and GPU completing.
		uint64_t wait_time_ns;
		//! Extra time between end of draw time and when the compositor wakes up.
		uint64_t margin_ns;
	} app; //!< App statistics.

	struct
	{
		//! The last display time that the thing driving this helper got.
		uint64_t predicted_display_time_ns;
		//! The last display period the hardware is running at.
		uint64_t predicted_display_period_ns;
		//! The extra time needed by the thing driving this helper.
		uint64_t extra_ns;
	} last_input;

	uint64_t last_returned_ns;
};


/*
 *
 * Helpers
 *
 */

static inline struct pacing_app *
pacing_app(struct u_pacing_app *upa)
{
	return (struct pacing_app *)upa;
}

#define DEBUG_PRINT_FRAME_ID() UPA_LOG_T("%" PRIi64, frame_id)
#define GET_INDEX_FROM_ID(RT, ID) ((uint64_t)(ID) % FRAME_COUNT)

#define IIR_ALPHA_LT 0.8
#define IIR_ALPHA_GT 0.8

static void
do_iir_filter(uint64_t *target, double alpha_lt, double alpha_gt, uint64_t sample)
{
	uint64_t t = *target;
	double alpha = t < sample ? alpha_lt : alpha_gt;
	double a = time_ns_to_s(t) * alpha;
	double b = time_ns_to_s(sample) * (1.0 - alpha);
	*target = time_s_to_ns(a + b);
}

static uint64_t
min_period(const struct pacing_app *pa)
{
	return pa->last_input.predicted_display_period_ns;
}

static uint64_t
last_sample_displayed(const struct pacing_app *pa)
{
	return pa->last_input.predicted_display_time_ns;
}

static uint64_t
last_return_predicted_display(const struct pacing_app *pa)
{
	return pa->last_returned_ns;
}

static uint64_t
total_app_time_ns(const struct pacing_app *pa)
{
	return pa->app.cpu_time_ns + pa->app.draw_time_ns + pa->app.wait_time_ns;
}

static uint64_t
total_compositor_time_ns(const struct pacing_app *pa)
{
	return pa->app.margin_ns + pa->last_input.extra_ns;
}

static uint64_t
total_app_and_compositor_time_ns(const struct pacing_app *pa)
{
	return total_app_time_ns(pa) + total_compositor_time_ns(pa);
}

static uint64_t
calc_period(const struct pacing_app *pa)
{
	// Error checking.
	uint64_t base_period_ns = min_period(pa);
	if (base_period_ns == 0) {
		assert(false && "Have not yet received and samples from timing driver.");
		base_period_ns = U_TIME_1MS_IN_NS * 16; // Sure
	}

	// Calculate the using both values separately.
	uint64_t period_ns = base_period_ns;
	while (pa->app.cpu_time_ns > period_ns) {
		period_ns += base_period_ns;
	}

	while (pa->app.draw_time_ns > period_ns) {
		period_ns += base_period_ns;
	}

	while (pa->app.wait_time_ns > period_ns) {
		period_ns += base_period_ns;
	}

	return period_ns;
}

static uint64_t
predict_display_time(const struct pacing_app *pa, uint64_t now_ns, uint64_t period_ns)
{

	// Total app and compositor time to produce a frame
	uint64_t app_and_compositor_time_ns = total_app_and_compositor_time_ns(pa);

	// Start from the last time that the driver displayed something.
	uint64_t val = last_sample_displayed(pa);

	// Return a time after the last returned display time. Add half the
	// display period to the comparison for robustness when the last display
	// time shifts slightly with respect to the last sample.
	while (val <= last_return_predicted_display(pa) + (period_ns / 2)) {
		val += period_ns;
	}

	// Have to have enough time to perform app work.
	while ((val - app_and_compositor_time_ns) <= now_ns) {
		val += period_ns;
	}

	return val;
}


/*
 *
 * Metrics and tracing.
 *
 */

static void
do_metrics(struct pacing_app *pa, struct u_pa_frame *f, bool discarded)
{
	if (!u_metrics_is_active()) {
		return;
	}

	struct u_metrics_session_frame umsf = {
	    .session_id = pa->session_id,
	    .frame_id = f->frame_id,
	    .predicted_frame_time_ns = f->predicted_frame_time_ns,
	    .predicted_wake_up_time_ns = f->predicted_wake_up_time_ns,
	    .predicted_gpu_done_time_ns = f->predicted_gpu_done_time_ns,
	    .predicted_display_time_ns = f->predicted_display_time_ns,
	    .predicted_display_period_ns = f->predicted_display_period_ns,
	    .display_time_ns = f->display_time_ns,
	    .when_predicted_ns = f->when.predicted_ns,
	    .when_wait_woke_ns = f->when.wait_woke_ns,
	    .when_begin_ns = f->when.begin_ns,
	    .when_delivered_ns = f->when.delivered_ns,
	    .when_gpu_done_ns = f->when.gpu_done_ns,
	    .discarded = discarded,
	};

	u_metrics_write_session_frame(&umsf);
}

static void
do_tracing(struct pacing_app *pa, struct u_pa_frame *f)
{
	if (!U_TRACE_CATEGORY_IS_ENABLED(timing)) {
		return;
	}

#ifdef U_TRACE_TRACY // Uses Tracy specific things.
	uint64_t cpu_ns = f->when.begin_ns - f->when.wait_woke_ns;
	TracyCPlot("App CPU(ms)", time_ns_to_ms_f(cpu_ns));

	uint64_t draw_ns = f->when.delivered_ns - f->when.begin_ns;
	TracyCPlot("App Draw(ms)", time_ns_to_ms_f(draw_ns));

	uint64_t gpu_ns = f->when.gpu_done_ns - f->when.delivered_ns;
	TracyCPlot("App GPU(ms)", time_ns_to_ms_f(gpu_ns));

	uint64_t frame_ns = f->when.gpu_done_ns - f->when.wait_woke_ns;
	TracyCPlot("App Frame(ms)", time_ns_to_ms_f(frame_ns));

	int64_t wake_diff_ns = (int64_t)f->when.wait_woke_ns - (int64_t)f->predicted_wake_up_time_ns;
	TracyCPlot("App Wake Diff(ms)", time_ns_to_ms_f(wake_diff_ns));

	int64_t gpu_diff_ns = (int64_t)f->when.gpu_done_ns - (int64_t)f->predicted_gpu_done_time_ns;
	TracyCPlot("App Frame Diff(ms)", time_ns_to_ms_f(gpu_diff_ns));
#endif

#ifdef U_TRACE_PERCETTO // Uses Percetto specific things.
#define TE_BEG(TRACK, TIME, NAME) U_TRACE_EVENT_BEGIN_ON_TRACK_DATA(timing, TRACK, TIME, NAME, PERCETTO_I(f->frame_id))
#define TE_END(TRACK, TIME) U_TRACE_EVENT_END_ON_TRACK(timing, TRACK, TIME)

	TE_BEG(pa_cpu, f->when.predicted_ns, "sleep");
	TE_END(pa_cpu, f->when.wait_woke_ns);

	uint64_t cpu_start_ns = f->when.wait_woke_ns + 1;
	TE_BEG(pa_cpu, cpu_start_ns, "cpu");
	TE_END(pa_cpu, f->when.begin_ns);

	TE_BEG(pa_draw, f->when.begin_ns, "draw");
	if (f->when.begin_ns > f->predicted_gpu_done_time_ns) {
		TE_BEG(pa_draw, f->when.begin_ns, "late");
		TE_END(pa_draw, f->when.delivered_ns);
	} else if (f->when.delivered_ns > f->predicted_gpu_done_time_ns) {
		TE_BEG(pa_draw, f->predicted_gpu_done_time_ns, "late");
		TE_END(pa_draw, f->when.delivered_ns);
	}
	TE_END(pa_draw, f->when.delivered_ns);

	TE_BEG(pa_wait, f->when.delivered_ns, "wait");
	if (f->when.delivered_ns > f->predicted_gpu_done_time_ns) {
		TE_BEG(pa_wait, f->when.delivered_ns, "late");
		TE_END(pa_wait, f->when.gpu_done_ns);
	} else if (f->when.delivered_ns > f->predicted_gpu_done_time_ns) {
		TE_BEG(pa_wait, f->predicted_gpu_done_time_ns, "late");
		TE_END(pa_wait, f->when.gpu_done_ns);
	}
	TE_END(pa_wait, f->when.gpu_done_ns);

#undef TE_BEG
#undef TE_END
#endif
}


/*
 *
 * Member functions.
 *
 */

static void
pa_predict(struct u_pacing_app *upa,
           uint64_t now_ns,
           int64_t *out_frame_id,
           uint64_t *out_wake_up_time,
           uint64_t *out_predicted_display_time,
           uint64_t *out_predicted_display_period)
{
	struct pacing_app *pa = pacing_app(upa);

	int64_t frame_id = ++pa->frame_counter;
	*out_frame_id = frame_id;

	DEBUG_PRINT_FRAME_ID();

	uint64_t period_ns = calc_period(pa);
	uint64_t predict_ns = predict_display_time(pa, now_ns, period_ns);
	// How long we think the frame should take.
	uint64_t frame_time_ns = total_app_time_ns(pa);
	// When should the client wake up.
	uint64_t wake_up_time_ns = predict_ns - total_app_and_compositor_time_ns(pa);
	// When the client's GPU work should have completed.
	uint64_t gpu_done_time_ns = predict_ns - total_compositor_time_ns(pa);

	pa->last_returned_ns = predict_ns;

	*out_wake_up_time = wake_up_time_ns;
	*out_predicted_display_time = predict_ns;
	*out_predicted_display_period = period_ns;

	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == -1);
	assert(f->state == U_PA_READY);

	f->state = U_RT_PREDICTED;
	f->frame_id = frame_id;
	f->predicted_frame_time_ns = frame_time_ns;
	f->predicted_wake_up_time_ns = wake_up_time_ns;
	f->predicted_gpu_done_time_ns = gpu_done_time_ns;
	f->predicted_display_time_ns = predict_ns;
	f->predicted_display_period_ns = period_ns;
	f->when.predicted_ns = now_ns;

#ifdef U_TRACE_TRACY // Uses Tracy specific things.
	TracyCPlot("App time(ms)", time_ns_to_ms_f(total_app_time_ns(pa)));
#endif
}

static void
pa_mark_point(struct u_pacing_app *upa, int64_t frame_id, enum u_timing_point point, uint64_t when_ns)
{
	struct pacing_app *pa = pacing_app(upa);

	UPA_LOG_T("%" PRIi64 " (%u)", frame_id, point);

	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == frame_id);

	switch (point) {
	case U_TIMING_POINT_WAKE_UP:
		assert(f->state == U_RT_PREDICTED);

		f->when.wait_woke_ns = when_ns;
		f->state = U_RT_WAIT_LEFT;
		break;
	case U_TIMING_POINT_BEGIN:
		assert(f->state == U_RT_WAIT_LEFT);

		f->when.begin_ns = when_ns;
		f->state = U_RT_BEGUN;
		break;
	case U_TIMING_POINT_SUBMIT:
	default: assert(false);
	}
}

static void
pa_mark_discarded(struct u_pacing_app *upa, int64_t frame_id, uint64_t when_ns)
{
	struct pacing_app *pa = pacing_app(upa);

	DEBUG_PRINT_FRAME_ID();

	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == frame_id);
	assert(f->state == U_RT_WAIT_LEFT || f->state == U_RT_BEGUN);

	// Update all data.
	f->when.delivered_ns = when_ns;

	// Write out metrics data.
	do_metrics(pa, f, true);

	// Reset the frame.
	U_ZERO(f); // Zero for metrics
	f->state = U_PA_READY;
	f->frame_id = -1;
}

static void
pa_mark_delivered(struct u_pacing_app *upa, int64_t frame_id, uint64_t when_ns, uint64_t display_time_ns)
{
	struct pacing_app *pa = pacing_app(upa);

	DEBUG_PRINT_FRAME_ID();

	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == frame_id);
	assert(f->state == U_RT_BEGUN);

	// Update all data.
	f->when.delivered_ns = when_ns;
	f->display_time_ns = display_time_ns;
	f->state = U_RT_DELIVERED;
}

static void
pa_mark_gpu_done(struct u_pacing_app *upa, int64_t frame_id, uint64_t when_ns)
{
	struct pacing_app *pa = pacing_app(upa);

	DEBUG_PRINT_FRAME_ID();

	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == frame_id);
	assert(f->state == U_RT_DELIVERED);

	// Update all data.
	f->when.gpu_done_ns = when_ns;
	f->state = U_RT_GPU_DONE;


	/*
	 * Process data.
	 */

	int64_t diff_ns = f->predicted_gpu_done_time_ns - when_ns;
	bool late = false;
	if (diff_ns < 0) {
		diff_ns = -diff_ns;
		late = true;
	}

	uint64_t diff_cpu_ns = f->when.begin_ns - f->when.wait_woke_ns;
	uint64_t diff_draw_ns = f->when.delivered_ns - f->when.begin_ns;
	uint64_t diff_wait_ns = f->when.gpu_done_ns - f->when.delivered_ns;

	UPA_LOG_D(
	    "Delivered frame %.2fms %s."                                           //
	    "\n\tperiod: %.2f"                                                     //
	    "\n\tcpu  o: %.2f, n: %.2f"                                            //
	    "\n\tdraw o: %.2f, n: %.2f"                                            //
	    "\n\twait o: %.2f, n: %.2f",                                           //
	    time_ns_to_ms_f(diff_ns), late ? "late" : "early",                     //
	    time_ns_to_ms_f(f->predicted_display_period_ns),                       //
	    time_ns_to_ms_f(pa->app.cpu_time_ns), time_ns_to_ms_f(diff_cpu_ns),    //
	    time_ns_to_ms_f(pa->app.draw_time_ns), time_ns_to_ms_f(diff_draw_ns),  //
	    time_ns_to_ms_f(pa->app.wait_time_ns), time_ns_to_ms_f(diff_wait_ns)); //

	do_iir_filter(&pa->app.cpu_time_ns, IIR_ALPHA_LT, IIR_ALPHA_GT, diff_cpu_ns);
	do_iir_filter(&pa->app.draw_time_ns, IIR_ALPHA_LT, IIR_ALPHA_GT, diff_draw_ns);
	do_iir_filter(&pa->app.wait_time_ns, IIR_ALPHA_LT, IIR_ALPHA_GT, diff_wait_ns);

	// Write out metrics and tracing data.
	do_metrics(pa, f, false);
	do_tracing(pa, f);

#ifndef VALIDATE_LATCHED_AND_RETIRED
	// Reset the frame.
	U_ZERO(f); // Zero for metrics
	f->state = U_PA_READY;
	f->frame_id = -1;
#endif
}

static void
pa_latched(struct u_pacing_app *upa, int64_t frame_id, uint64_t when_ns, int64_t system_frame_id)
{
	struct pacing_app *pa = pacing_app(upa);

#ifdef VALIDATE_LATCHED_AND_RETIRED
	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == frame_id);
	assert(f->state == U_RT_GPU_DONE);
#else
	(void)pa;
#endif

	struct u_metrics_used umu = {
	    .session_id = pa->session_id,
	    .session_frame_id = frame_id,
	    .system_frame_id = system_frame_id,
	    .when_ns = when_ns,
	};

	u_metrics_write_used(&umu);
}

static void
pa_retired(struct u_pacing_app *upa, int64_t frame_id, uint64_t when_ns)
{
	struct pacing_app *pa = pacing_app(upa);

#ifdef VALIDATE_LATCHED_AND_RETIRED
	size_t index = GET_INDEX_FROM_ID(pa, frame_id);
	struct u_pa_frame *f = &pa->frames[index];
	assert(f->frame_id == frame_id);
	assert(f->state == U_RT_GPU_DONE || f->state == U_RT_DELIVERED);

	// Reset the frame.
	U_ZERO(f); // Zero for metrics
	f->state = U_PA_READY;
	f->frame_id = -1;
#else
	(void)pa;
#endif
}

static void
pa_info(struct u_pacing_app *upa,
        uint64_t predicted_display_time_ns,
        uint64_t predicted_display_period_ns,
        uint64_t extra_ns)
{
	struct pacing_app *pa = pacing_app(upa);

	pa->last_input.predicted_display_time_ns = predicted_display_time_ns;
	pa->last_input.predicted_display_period_ns = predicted_display_period_ns;
	pa->last_input.extra_ns = extra_ns;
}

static void
pa_destroy(struct u_pacing_app *upa)
{
	free(upa);
}

static xrt_result_t
pa_create(int64_t session_id, struct u_pacing_app **out_upa)
{
	struct pacing_app *pa = U_TYPED_CALLOC(struct pacing_app);
	pa->base.predict = pa_predict;
	pa->base.mark_point = pa_mark_point;
	pa->base.mark_discarded = pa_mark_discarded;
	pa->base.mark_delivered = pa_mark_delivered;
	pa->base.mark_gpu_done = pa_mark_gpu_done;
	pa->base.latched = pa_latched;
	pa->base.retired = pa_retired;
	pa->base.info = pa_info;
	pa->base.destroy = pa_destroy;
	pa->session_id = session_id;
	pa->app.cpu_time_ns = U_TIME_1MS_IN_NS * 2;
	pa->app.draw_time_ns = U_TIME_1MS_IN_NS * 2;
	pa->app.margin_ns = U_TIME_1MS_IN_NS * 2;

	for (size_t i = 0; i < ARRAY_SIZE(pa->frames); i++) {
		pa->frames[i].state = U_PA_READY;
		pa->frames[i].frame_id = -1;
	}

	*out_upa = &pa->base;

	return XRT_SUCCESS;
}


/*
 *
 * Factory functions.
 *
 */

static xrt_result_t
paf_create(struct u_pacing_app_factory *upaf, struct u_pacing_app **out_upa)
{
	static int64_t session_id_gen = 0; // For now until global session id is introduced.

	return pa_create(session_id_gen++, out_upa);
}

static void
paf_destroy(struct u_pacing_app_factory *upaf)
{
	free(upaf);
}


/*
 *
 * 'Exported' functions.
 *
 */

xrt_result_t
u_pa_factory_create(struct u_pacing_app_factory **out_upaf)
{
	struct u_pacing_app_factory *upaf = U_TYPED_CALLOC(struct u_pacing_app_factory);
	upaf->create = paf_create;
	upaf->destroy = paf_destroy;

	*out_upaf = upaf;

	return XRT_SUCCESS;
}