From 95fb034b922f2d7e5106a2bd65b46d2f47889533 Mon Sep 17 00:00:00 2001
From: Christoph Haag <christoph.haag@collabora.com>
Date: Thu, 26 May 2022 02:39:59 +0200
Subject: [PATCH] c/render: Implement layer squashing for compute shader
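
Add a compute shader path that squashes all active layers for both
views into a single target image in one dispatch. Stereo projection
layers (with and without depth) and quad layers are handled; timewarp
for projection layers is selected by binding a second pipeline
variant.

The new shaders/layer.comp is configured through specialization
constants (do_timewarp, do_color_correction, COMP_MAX_LAYERS,
COMP_VIEWS_PER_LAYER and the size of the source image array) and a
single std140 UBO (struct render_compute_layer_ubo_data) carrying the
per-view viewports and pre-transforms, per-layer sub-image and
timewarp transforms, layer type and unpremultiplied-alpha flags,
source image indices and quad geometry. Both pipeline variants share
one pipeline layout and descriptor set layout: an array of combined
image samplers (sized to the device limit, clamped to
COMP_MAX_IMAGES), a storage image target and the UBO. The compute
descriptor pool is enlarged to hold the extra set.

render_compute_layers() transitions the target image to GENERAL,
updates the descriptor set, binds the selected pipeline, dispatches
one 8x8 workgroup grid per view (groupCountZ = 2) and finally
transitions the target to the caller-requested layout.

Rough sketch of the intended call pattern (not part of this patch;
the caller is assumed to have gathered the per-layer samplers and
image views and to have filled r->compute.layer.ubo.mapped with a
struct render_compute_layer_ubo_data beforehand):

	struct render_compute crc = {0};

	render_compute_init(&crc, r);
	render_compute_begin(&crc);

	render_compute_layers(                        //
	    &crc,                                     //
	    src_samplers,                             // from the layer swapchains
	    src_image_views,                          // from the layer swapchains
	    image_count,                              //
	    target_image,                             // e.g. the scratch image
	    target_image_view,                        //
	    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, // transition_to
	    use_timewarp);                            //

	render_compute_end(&crc);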

Co-authored-by: Jakob Bornecrantz <jakob@collabora.com>
Co-authored-by: Fernando Velazquez Innella <finnella@magicleap.com>
---
 src/xrt/compositor/CMakeLists.txt            |   1 +
 src/xrt/compositor/render/render_compute.c   | 179 ++++++++++++
 src/xrt/compositor/render/render_interface.h | 104 +++++++
 src/xrt/compositor/render/render_resources.c | 182 +++++++++++-
 src/xrt/compositor/render/render_shaders.c   |   7 +
 src/xrt/compositor/shaders/layer.comp        | 283 +++++++++++++++++++
 6 files changed, 754 insertions(+), 2 deletions(-)
 create mode 100644 src/xrt/compositor/shaders/layer.comp

diff --git a/src/xrt/compositor/CMakeLists.txt b/src/xrt/compositor/CMakeLists.txt
index 5c81f1cdb..df10d350a 100644
--- a/src/xrt/compositor/CMakeLists.txt
+++ b/src/xrt/compositor/CMakeLists.txt
@@ -120,6 +120,7 @@ if(XRT_HAVE_VULKAN)
 	set(SHADERS
 	    shaders/clear.comp
 	    shaders/distortion.comp
+	    shaders/layer.comp
 	    shaders/mesh.frag
 	    shaders/mesh.vert
 	    shaders/layer.frag
diff --git a/src/xrt/compositor/render/render_compute.c b/src/xrt/compositor/render/render_compute.c
index 22c9fac06..17a0b423a 100644
--- a/src/xrt/compositor/render/render_compute.c
+++ b/src/xrt/compositor/render/render_compute.c
@@ -85,6 +85,74 @@ calc_dispatch_dims(const struct render_viewport_data views[2], uint32_t *out_w,
  *
  */
 
+XRT_MAYBE_UNUSED static void
+update_compute_layer_descriptor_set(struct vk_bundle *vk,
+                                    uint32_t src_binding,
+                                    VkSampler src_samplers[COMP_MAX_IMAGES],
+                                    VkImageView src_image_views[COMP_MAX_IMAGES],
+                                    uint32_t image_count,
+                                    uint32_t target_binding,
+                                    VkImageView target_image_view,
+                                    uint32_t ubo_binding,
+                                    VkBuffer ubo_buffer,
+                                    VkDeviceSize ubo_size,
+                                    VkDescriptorSet descriptor_set)
+{
+	assert(image_count <= COMP_MAX_IMAGES);
+
+	VkDescriptorImageInfo src_image_info[COMP_MAX_IMAGES];
+	for (uint32_t i = 0; i < image_count; i++) {
+		src_image_info[i].sampler = src_samplers[i];
+		src_image_info[i].imageView = src_image_views[i];
+		src_image_info[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+	}
+
+	VkDescriptorImageInfo target_image_info = {
+	    .imageView = target_image_view,
+	    .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+	};
+
+	VkDescriptorBufferInfo buffer_info = {
+	    .buffer = ubo_buffer,
+	    .offset = 0,
+	    .range = ubo_size,
+	};
+
+	VkWriteDescriptorSet write_descriptor_sets[3] = {
+	    {
+	        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+	        .dstSet = descriptor_set,
+	        .dstBinding = src_binding,
+	        .descriptorCount = image_count,
+	        .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+	        .pImageInfo = src_image_info,
+	    },
+	    {
+	        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+	        .dstSet = descriptor_set,
+	        .dstBinding = target_binding,
+	        .descriptorCount = 1,
+	        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+	        .pImageInfo = &target_image_info,
+	    },
+	    {
+	        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+	        .dstSet = descriptor_set,
+	        .dstBinding = ubo_binding,
+	        .descriptorCount = 1,
+	        .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+	        .pBufferInfo = &buffer_info,
+	    },
+	};
+
+	vk->vkUpdateDescriptorSets(            //
+	    vk->device,                        //
+	    ARRAY_SIZE(write_descriptor_sets), // descriptorWriteCount
+	    write_descriptor_sets,             // pDescriptorWrites
+	    0,                                 // descriptorCopyCount
+	    NULL);                             // pDescriptorCopies
+}
+
 XRT_MAYBE_UNUSED static void
 update_compute_distortion_descriptor_set(struct vk_bundle *vk,
                                          uint32_t src_binding,
@@ -262,6 +330,12 @@ render_compute_init(struct render_compute *crc, struct render_resources *r)
 	struct vk_bundle *vk = r->vk;
 	crc->r = r;
 
+	C(vk_create_descriptor_set(                 //
+	    vk,                                     //
+	    r->compute.descriptor_pool,             // descriptor_pool
+	    r->compute.layer.descriptor_set_layout, // descriptor_set_layout
+	    &crc->descriptor_set));                 // descriptor_set
+
 	C(vk_create_descriptor_set(                      //
 	    vk,                                          //
 	    r->compute.descriptor_pool,                  // descriptor_pool
@@ -326,6 +400,7 @@ render_compute_close(struct render_compute *crc)
 	struct vk_bundle *vk = vk_from_crc(crc);
 
 	// Reclaimed by vkResetDescriptorPool.
+	crc->descriptor_set = VK_NULL_HANDLE;
 	crc->distortion_descriptor_set = VK_NULL_HANDLE;
 
 	vk->vkResetDescriptorPool(vk->device, crc->r->compute.descriptor_pool, 0);
@@ -333,6 +408,110 @@ render_compute_close(struct render_compute *crc)
 	crc->r = NULL;
 }
 
+void
+render_compute_layers(struct render_compute *crc,
+                      VkSampler src_samplers[COMP_MAX_IMAGES],
+                      VkImageView src_image_views[COMP_MAX_IMAGES],
+                      uint32_t image_count,
+                      VkImage target_image,
+                      VkImageView target_image_view,
+                      VkImageLayout transition_to,
+                      bool timewarp)
+{
+	assert(crc->r != NULL);
+
+	struct vk_bundle *vk = vk_from_crc(crc);
+	struct render_resources *r = crc->r;
+
+	struct render_compute_layer_ubo_data *ubo_data =
+	    (struct render_compute_layer_ubo_data *)crc->r->compute.layer.ubo.mapped;
+
+	/*
+	 * Source and target images.
+	 */
+
+	VkImageSubresourceRange subresource_range = {
+	    .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+	    .baseMipLevel = 0,
+	    .levelCount = VK_REMAINING_MIP_LEVELS,
+	    .baseArrayLayer = 0,
+	    .layerCount = VK_REMAINING_ARRAY_LAYERS,
+	};
+
+	vk_cmd_image_barrier_gpu_locked( //
+	    vk,                          //
+	    r->cmd,                      //
+	    target_image,                //
+	    0,                           //
+	    VK_ACCESS_SHADER_WRITE_BIT,  //
+	    VK_IMAGE_LAYOUT_UNDEFINED,   //
+	    VK_IMAGE_LAYOUT_GENERAL,     //
+	    subresource_range);          //
+
+	update_compute_layer_descriptor_set( //
+	    vk,                              //
+	    r->compute.src_binding,          //
+	    src_samplers,                    //
+	    src_image_views,                 //
+	    image_count,                     //
+	    r->compute.target_binding,       //
+	    target_image_view,               //
+	    r->compute.ubo_binding,          //
+	    r->compute.layer.ubo.buffer,     //
+	    VK_WHOLE_SIZE,                   //
+	    crc->descriptor_set);            //
+
+	vk->vkCmdBindPipeline(              //
+	    crc->r->cmd,                    // commandBuffer
+	    VK_PIPELINE_BIND_POINT_COMPUTE, // pipelineBindPoint
+	    timewarp ? r->compute.layer.timewarp_pipeline : r->compute.layer.non_timewarp_pipeline); // pipeline
+
+	vk->vkCmdBindDescriptorSets(          //
+	    r->cmd,                           // commandBuffer
+	    VK_PIPELINE_BIND_POINT_COMPUTE,   // pipelineBindPoint
+	    r->compute.layer.pipeline_layout, // layout
+	    0,                                // firstSet
+	    1,                                // descriptorSetCount
+	    &crc->descriptor_set,             // pDescriptorSets
+	    0,                                // dynamicOffsetCount
+	    NULL);                            // pDynamicOffsets
+
+
+	uint32_t w = 0, h = 0;
+	calc_dispatch_dims(ubo_data->views, &w, &h);
+	assert(w != 0 && h != 0);
+
+	vk->vkCmdDispatch( //
+	    r->cmd,        // commandBuffer
+	    w,             // groupCountX
+	    h,             // groupCountY
+	    2);            // groupCountZ
+
+	VkImageMemoryBarrier memoryBarrier = {
+	    .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+	    .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+	    .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT,
+	    .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
+	    .newLayout = transition_to,
+	    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+	    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+	    .image = target_image,
+	    .subresourceRange = subresource_range,
+	};
+
+	vk->vkCmdPipelineBarrier(                 //
+	    r->cmd,                               //
+	    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, //
+	    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,    //
+	    0,                                    //
+	    0,                                    //
+	    NULL,                                 //
+	    0,                                    //
+	    NULL,                                 //
+	    1,                                    //
+	    &memoryBarrier);                      //
+}
+
 void
 render_compute_projection_timewarp(struct render_compute *crc,
                                    VkSampler src_samplers[2],
diff --git a/src/xrt/compositor/render/render_interface.h b/src/xrt/compositor/render/render_interface.h
index 789239934..b3c6562b7 100644
--- a/src/xrt/compositor/render/render_interface.h
+++ b/src/xrt/compositor/render/render_interface.h
@@ -11,6 +11,9 @@
 #pragma once
 
 #define COMP_MAX_LAYERS 16
+#define COMP_VIEWS_PER_LAYER 2
+#define COMP_MAX_IMAGES 32
+
 #include "xrt/xrt_compiler.h"
 #include "xrt/xrt_defines.h"
 
@@ -76,6 +79,7 @@ render_calc_time_warp_matrix(const struct xrt_pose *src_pose,
 struct render_shaders
 {
 	VkShaderModule clear_comp;
+	VkShaderModule layer_comp;
 	VkShaderModule distortion_comp;
 
 	VkShaderModule mesh_vert;
@@ -308,6 +312,27 @@ struct render_resources
 		//! Default sampler for null images.
 		VkSampler default_sampler;
 
+		struct
+		{
+			//! Descriptor set layout for the layer shader.
+			VkDescriptorSetLayout descriptor_set_layout;
+
+			//! Pipeline layout used for the layer shader.
+			VkPipelineLayout pipeline_layout;
+
+			//! Doesn't depend on target so is static.
+			VkPipeline non_timewarp_pipeline;
+
+			//! Doesn't depend on target so is static.
+			VkPipeline timewarp_pipeline;
+
+			//! Size of combined image sampler array
+			uint32_t image_array_size;
+
+			//! UBO for the layer shader parameters.
+			struct render_buffer ubo;
+		} layer;
+
 		struct
 		{
 			//! Descriptor set layout for compute distortion.
@@ -333,6 +358,8 @@ struct render_resources
 
 			//! Target info.
 			struct render_buffer ubo;
+
+			//! @todo other resources
 		} clear;
 	} compute;
 
@@ -673,9 +700,73 @@ struct render_compute
 	struct render_resources *r;
 
 	//! Shared descriptor set between clear, projection and timewarp.
+	VkDescriptorSet descriptor_set;
+
+	//! Descriptor set for distortion.
 	VkDescriptorSet distortion_descriptor_set;
 };
 
+/*!
+ * UBO data that is sent to the compute layer shaders.
+ *
+ * Used in @ref render_compute
+ */
+struct render_compute_layer_ubo_data
+{
+	struct render_viewport_data views[2];
+	struct xrt_normalized_rect pre_transforms[2];
+	struct xrt_normalized_rect post_transforms[COMP_MAX_LAYERS * COMP_VIEWS_PER_LAYER];
+
+	//! std140 uvec2: x corresponds to enum xrt_layer_type, y is the unpremultiplied alpha flag.
+	struct
+	{
+		uint32_t val;
+		uint32_t unpremultiplied;
+		uint32_t padding[2];
+	} layer_type[COMP_MAX_LAYERS];
+
+	//! Which image/sampler(s) correspond to each layer.
+	struct
+	{
+		uint32_t images[2];
+		//! @todo Implement separated samplers and images (and change to samplers[2])
+		uint32_t padding[2];
+	} images_samplers[COMP_MAX_LAYERS * 2];
+
+
+	/*!
+	 * For projection layers
+	 */
+
+	//! Timewarp matrices
+	struct xrt_matrix_4x4 transforms[COMP_MAX_LAYERS * COMP_VIEWS_PER_LAYER];
+
+
+	/*!
+	 * For quad layers
+	 */
+
+	//! All quad transforms and coordinates are in view space
+	struct
+	{
+		struct xrt_vec3 val;
+		float padding;
+	} quad_position[COMP_MAX_LAYERS * 2];
+	struct
+	{
+		struct xrt_vec3 val;
+		float padding;
+	} quad_normal[COMP_MAX_LAYERS * 2];
+	struct xrt_matrix_4x4 inverse_quad_transform[COMP_MAX_LAYERS * 2];
+
+	//! Quad extent in world scale
+	struct
+	{
+		struct xrt_vec2 val;
+		float padding[2];
+	} quad_extent[COMP_MAX_LAYERS];
+};
+
 /*!
  * UBO data that is sent to the compute distortion shaders.
  *
@@ -723,6 +814,19 @@ render_compute_begin(struct render_compute *crc);
 bool
 render_compute_end(struct render_compute *crc);
 
+/*!
+ * @public @memberof render_compute
+ */
+void
+render_compute_layers(struct render_compute *crc,                   //
+                      VkSampler src_samplers[COMP_MAX_IMAGES],      //
+                      VkImageView src_image_views[COMP_MAX_IMAGES], //
+                      uint32_t image_count,                         //
+                      VkImage target_image,                         //
+                      VkImageView target_image_view,                //
+                      VkImageLayout transition_to,                  //
+                      bool timewarp);                               //
+
 /*!
  * @public @memberof render_compute
  */
diff --git a/src/xrt/compositor/render/render_resources.c b/src/xrt/compositor/render/render_resources.c
index d8b8e92d9..63b074fad 100644
--- a/src/xrt/compositor/render/render_resources.c
+++ b/src/xrt/compositor/render/render_resources.c
@@ -181,6 +181,59 @@ init_mesh_ubo_buffers(struct vk_bundle *vk, struct render_buffer *l_ubo, struct
  *
  */
 
+static VkResult
+create_compute_layer_descriptor_set_layout(struct vk_bundle *vk,
+                                           uint32_t src_binding,
+                                           uint32_t target_binding,
+                                           uint32_t ubo_binding,
+                                           uint32_t source_images_count,
+                                           VkDescriptorSetLayout *out_descriptor_set_layout)
+{
+	VkResult ret;
+
+	VkDescriptorSetLayoutBinding set_layout_bindings[3] = {
+	    {
+	        .binding = src_binding,
+	        .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+	        .descriptorCount = source_images_count,
+	        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+	    },
+	    {
+	        .binding = target_binding,
+	        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+	        .descriptorCount = 1,
+	        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+	    },
+	    {
+	        .binding = ubo_binding,
+	        .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+	        .descriptorCount = 1,
+	        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+	    },
+	};
+
+	VkDescriptorSetLayoutCreateInfo set_layout_info = {
+	    .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+	    .bindingCount = ARRAY_SIZE(set_layout_bindings),
+	    .pBindings = set_layout_bindings,
+	};
+
+	VkDescriptorSetLayout descriptor_set_layout = VK_NULL_HANDLE;
+	ret = vk->vkCreateDescriptorSetLayout( //
+	    vk->device,                        //
+	    &set_layout_info,                  //
+	    NULL,                              //
+	    &descriptor_set_layout);           //
+	if (ret != VK_SUCCESS) {
+		VK_ERROR(vk, "vkCreateDescriptorSetLayout failed: %s", vk_result_string(ret));
+		return ret;
+	}
+
+	*out_descriptor_set_layout = descriptor_set_layout;
+
+	return VK_SUCCESS;
+}
+
 static VkResult
 create_compute_distortion_descriptor_set_layout(struct vk_bundle *vk,
                                                 uint32_t src_binding,
@@ -240,12 +293,61 @@ create_compute_distortion_descriptor_set_layout(struct vk_bundle *vk,
 	return VK_SUCCESS;
 }
 
+struct compute_layer_params
+{
+	VkBool32 do_timewarp;
+	VkBool32 do_color_correction;
+	uint32_t max_layers;
+	uint32_t views_per_layer;
+	uint32_t image_array_size;
+};
+
 struct compute_distortion_params
 {
 	uint32_t distortion_texel_count;
 	VkBool32 do_timewarp;
 };
 
+static VkResult
+create_compute_layer_pipeline(struct vk_bundle *vk,
+                              VkPipelineCache pipeline_cache,
+                              VkShaderModule shader,
+                              VkPipelineLayout pipeline_layout,
+                              const struct compute_layer_params *params,
+                              VkPipeline *out_compute_pipeline)
+{
+#define ENTRY(ID, FIELD)                                                                                               \
+	{                                                                                                              \
+	    .constantID = ID,                                                                                          \
+	    .offset = offsetof(struct compute_layer_params, FIELD),                                                    \
+	    .size = sizeof(params->FIELD),                                                                             \
+	}
+
+	VkSpecializationMapEntry entries[] = {
+	    ENTRY(1, do_timewarp),         //
+	    ENTRY(2, do_color_correction), //
+	    ENTRY(3, max_layers),          //
+	    ENTRY(4, views_per_layer),     //
+	    ENTRY(5, image_array_size),    //
+	};
+#undef ENTRY
+
+	VkSpecializationInfo specialization_info = {
+	    .mapEntryCount = ARRAY_SIZE(entries),
+	    .pMapEntries = entries,
+	    .dataSize = sizeof(*params),
+	    .pData = params,
+	};
+
+	return vk_create_compute_pipeline( //
+	    vk,                            // vk_bundle
+	    pipeline_cache,                // pipeline_cache
+	    shader,                        // shader
+	    pipeline_layout,               // pipeline_layout
+	    &specialization_info,          // specialization_info
+	    out_compute_pipeline);         // out_compute_pipeline
+}
+
 static VkResult
 create_compute_distortion_pipeline(struct vk_bundle *vk,
                                    VkPipelineCache pipeline_cache,
@@ -705,6 +807,11 @@ render_resources_init(struct render_resources *r,
 	r->compute.target_binding = 2;
 	r->compute.ubo_binding = 3;
 
+	r->compute.layer.image_array_size = vk->features.max_per_stage_descriptor_sampled_images;
+	if (r->compute.layer.image_array_size > COMP_MAX_IMAGES) {
+		r->compute.layer.image_array_size = COMP_MAX_IMAGES;
+	}
+
 
 	/*
 	 * Mock, used as a default image empty image.
@@ -841,12 +948,14 @@ render_resources_init(struct render_resources *r,
 	    VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // clamp_mode
 	    &r->compute.default_sampler));         // out_sampler
 
+
 	struct vk_descriptor_pool_info compute_pool_info = {
 	    .uniform_per_descriptor_count = 1,
-	    .sampler_per_descriptor_count = 8,
+	    // Source image array for the layer shader, plus the six distortion images.
+	    .sampler_per_descriptor_count = r->compute.layer.image_array_size + 6,
 	    .storage_image_per_descriptor_count = 1,
 	    .storage_buffer_per_descriptor_count = 0,
-	    .descriptor_count = 1,
+	    .descriptor_count = 2,
 	    .freeable = false,
 	};
 
@@ -856,6 +965,68 @@ render_resources_init(struct render_resources *r,
 	    &r->compute.descriptor_pool)); // out_descriptor_pool
 
 
+	/*
+	 * Layer pipeline
+	 */
+
+	C(create_compute_layer_descriptor_set_layout(  //
+	    vk,                                        // vk_bundle
+	    r->compute.src_binding,                    // src_binding,
+	    r->compute.target_binding,                 // target_binding,
+	    r->compute.ubo_binding,                    // ubo_binding,
+	    r->compute.layer.image_array_size,         // source_images_count,
+	    &r->compute.layer.descriptor_set_layout)); // out_descriptor_set_layout
+
+	C(vk_create_pipeline_layout(                //
+	    vk,                                     // vk_bundle
+	    r->compute.layer.descriptor_set_layout, // descriptor_set_layout
+	    &r->compute.layer.pipeline_layout));    // out_pipeline_layout
+
+	struct compute_layer_params layer_params = {
+	    .do_timewarp = false,
+	    .do_color_correction = true,
+	    .max_layers = COMP_MAX_LAYERS,
+	    .views_per_layer = COMP_VIEWS_PER_LAYER,
+	    .image_array_size = r->compute.layer.image_array_size,
+	};
+
+	C(create_compute_layer_pipeline(               //
+	    vk,                                        // vk_bundle
+	    r->pipeline_cache,                         // pipeline_cache
+	    r->shaders->layer_comp,                    // shader
+	    r->compute.layer.pipeline_layout,          // pipeline_layout
+	    &layer_params,                             // params
+	    &r->compute.layer.non_timewarp_pipeline)); // out_compute_pipeline
+
+	struct compute_layer_params layer_timewarp_params = {
+	    .do_timewarp = true,
+	    .do_color_correction = true,
+	    .max_layers = COMP_MAX_LAYERS,
+	    .views_per_layer = COMP_VIEWS_PER_LAYER,
+	    .image_array_size = r->compute.layer.image_array_size,
+	};
+
+	C(create_compute_layer_pipeline(           //
+	    vk,                                    // vk_bundle
+	    r->pipeline_cache,                     // pipeline_cache
+	    r->shaders->layer_comp,                // shader
+	    r->compute.layer.pipeline_layout,      // pipeline_layout
+	    &layer_timewarp_params,                // params
+	    &r->compute.layer.timewarp_pipeline)); // out_compute_pipeline
+
+	size_t layer_ubo_size = sizeof(struct render_compute_layer_ubo_data);
+
+	C(render_buffer_init(        //
+	    vk,                      // vk_bundle
+	    &r->compute.layer.ubo,   // buffer
+	    ubo_usage_flags,         // usage_flags
+	    memory_property_flags,   // memory_property_flags
+	    layer_ubo_size));        // size
+	C(render_buffer_map(         //
+	    vk,                      // vk_bundle
+	    &r->compute.layer.ubo)); // buffer
+
+
 	/*
 	 * Distortion pipeline
 	 */
@@ -1108,6 +1279,12 @@ render_resources_close(struct render_resources *r)
 	render_buffer_close(vk, &r->mesh.ubos[1]);
 
 	D(DescriptorPool, r->compute.descriptor_pool);
+
+	D(DescriptorSetLayout, r->compute.layer.descriptor_set_layout);
+	D(Pipeline, r->compute.layer.non_timewarp_pipeline);
+	D(Pipeline, r->compute.layer.timewarp_pipeline);
+	D(PipelineLayout, r->compute.layer.pipeline_layout);
+
 	D(DescriptorSetLayout, r->compute.distortion.descriptor_set_layout);
 	D(Pipeline, r->compute.distortion.pipeline);
 	D(Pipeline, r->compute.distortion.timewarp_pipeline);
@@ -1119,6 +1296,7 @@ render_resources_close(struct render_resources *r)
 
 	render_distortion_buffer_close(r);
 	render_buffer_close(vk, &r->compute.clear.ubo);
+	render_buffer_close(vk, &r->compute.layer.ubo);
 	render_buffer_close(vk, &r->compute.distortion.ubo);
 
 	teardown_scratch_image(r);
diff --git a/src/xrt/compositor/render/render_shaders.c b/src/xrt/compositor/render/render_shaders.c
index f1d9f7dfb..22b1ddbc3 100644
--- a/src/xrt/compositor/render/render_shaders.c
+++ b/src/xrt/compositor/render/render_shaders.c
@@ -24,6 +24,7 @@
 #include "xrt/xrt_config_build.h"
 
 #include "shaders/clear.comp.h"
+#include "shaders/layer.comp.h"
 #include "shaders/distortion.comp.h"
 #include "shaders/layer.frag.h"
 #include "shaders/layer.vert.h"
@@ -93,6 +94,11 @@ render_shaders_load(struct render_shaders *s, struct vk_bundle *vk)
 	              sizeof(shaders_clear_comp), // size
 	              &s->clear_comp));           // out
 
+	C(shader_load(vk,                         // vk_bundle
+	              shaders_layer_comp,         // data
+	              sizeof(shaders_layer_comp), // size
+	              &s->layer_comp));           // out
+
 	C(shader_load(vk,                              // vk_bundle
 	              shaders_distortion_comp,         // data
 	              sizeof(shaders_distortion_comp), // size
@@ -161,6 +167,7 @@ render_shaders_close(struct render_shaders *s, struct vk_bundle *vk)
 {
 	D(clear_comp);
 	D(distortion_comp);
+	D(layer_comp);
 	D(mesh_vert);
 	D(mesh_frag);
 	D(equirect1_vert);
diff --git a/src/xrt/compositor/shaders/layer.comp b/src/xrt/compositor/shaders/layer.comp
new file mode 100644
index 000000000..a1024ca4a
--- /dev/null
+++ b/src/xrt/compositor/shaders/layer.comp
@@ -0,0 +1,283 @@
+// Copyright 2021-2022, Collabora Ltd.
+// Author: Jakob Bornecrantz <jakob@collabora.com>
+// Author: Christoph Haag <christoph.haag@collabora.com>
+// SPDX-License-Identifier: BSL-1.0
+
+#version 460
+#extension GL_GOOGLE_include_directive : require
+
+#include "srgb.inc.glsl"
+
+//! @todo should these be specialization constants?
+#define XRT_LAYER_STEREO_PROJECTION 0
+#define XRT_LAYER_STEREO_PROJECTION_DEPTH 1
+#define XRT_LAYER_QUAD 2
+#define XRT_LAYER_CUBE 3
+#define XRT_LAYER_CYLINDER 4
+#define XRT_LAYER_EQUIRECT1 5
+#define XRT_LAYER_EQUIRECT2 6
+
+// Should we do timewarp.
+layout(constant_id = 1) const bool do_timewarp = false;
+layout(constant_id = 2) const bool do_color_correction = true;
+layout(constant_id = 3) const int COMP_MAX_LAYERS = 16;
+layout(constant_id = 4) const int COMP_VIEWS_PER_LAYER = 2;
+layout(constant_id = 5) const int SAMPLER_ARRAY_SIZE = 16;
+
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+// layer 0 left color, layer 0 right color, [optional: layer 0 left depth, layer 0 right depth], layer 1 left, layer 1 right, ...
+layout(set = 0, binding = 0) uniform sampler2D source[SAMPLER_ARRAY_SIZE];
+layout(set = 0, binding = 2) uniform writeonly restrict image2D target;
+layout(set = 0, binding = 3, std140) uniform restrict Config
+{
+	ivec4 views[2];
+	vec4 pre_transform[2];
+	vec4 post_transform[COMP_MAX_LAYERS][2];
+
+	// .x corresponds to enum xrt_layer_type, .y is the unpremultiplied alpha flag
+	uvec2 layer_type_and_unpremultiplied[COMP_MAX_LAYERS];
+
+	// which image/sampler(s) correspond to each layer
+	ivec2 images_samplers[COMP_MAX_LAYERS][2];
+
+	// for projection layers
+
+	// timewarp matrices
+	mat4 transform[COMP_MAX_LAYERS][2];
+
+
+	// for quad layers
+
+	// all quad transforms and coordinates are in view space
+	vec4 quad_position[COMP_MAX_LAYERS][2];
+	vec4 quad_normal[COMP_MAX_LAYERS][2];
+	mat4 inverse_quad_transform[COMP_MAX_LAYERS][2];
+
+	// quad extent in world scale
+	vec2 quad_extent[COMP_MAX_LAYERS];
+} ubo;
+
+
+vec2 position_to_view_uv(ivec2 extent, uint ix, uint iy)
+{
+	// Turn the index into floating point.
+	vec2 xy = vec2(float(ix), float(iy));
+
+	// The inverse of the extent of a view image is the pixel size in [0 .. 1] space.
+	vec2 extent_pixel_size = vec2(1.0 / float(extent.x), 1.0 / float(extent.y));
+
+	// For each target pixel, step by one pixel size in UV space.
+	vec2 view_uv = xy * extent_pixel_size;
+
+	// Emulate a triangle sample position by offsetting by half a target pixel.
+	view_uv = view_uv + extent_pixel_size / 2.0;
+
+	return view_uv;
+}
+
+vec2 transform_uv_subimage(vec2 uv, uint iz, uint layer)
+{
+	vec2 values = uv;
+
+	// To deal with OpenGL flip and sub image view.
+	values.xy = values.xy * ubo.post_transform[layer][iz].zw + ubo.post_transform[layer][iz].xy;
+
+	// Ready to be used.
+	return values.xy;
+}
+
+vec2 transform_uv_timewarp(vec2 uv, uint view_index, uint layer)
+{
+	vec4 values = vec4(uv, -1, 1);
+
+	// From uv to tan angle (tangent space).
+	values.xy = values.xy * ubo.pre_transform[view_index].zw + ubo.pre_transform[view_index].xy;
+	values.y = -values.y; // Flip to OpenXR coordinate system.
+
+	// Timewarp.
+	values = ubo.transform[layer][view_index] * values;
+	values.xy = values.xy * (1.0 / max(values.w, 0.00001));
+
+	// From [-1, 1] to [0, 1]
+	values.xy = values.xy * 0.5 + 0.5;
+
+	// To deal with OpenGL flip and sub image view.
+	values.xy = values.xy * ubo.post_transform[layer][view_index].zw + ubo.post_transform[layer][view_index].xy;
+
+	// Done.
+	return values.xy;
+}
+
+vec2 transform_uv(vec2 uv, uint view_index, uint layer)
+{
+	if (do_timewarp) {
+		return transform_uv_timewarp(uv, view_index, layer);
+	} else {
+		return transform_uv_subimage(uv, view_index, layer);
+	}
+}
+
+vec4 do_projection(uint view_index, vec2 view_uv, uint layer)
+{
+	uint source_image_index = ubo.images_samplers[layer][view_index].x;
+
+	// Do any transformation needed.
+	vec2 uv = transform_uv(view_uv, view_index, layer);
+
+	// Sample the source.
+	vec4 colour = vec4(texture(source[source_image_index], uv).rgba);
+
+	return colour;
+}
+
+vec3 get_direction(vec2 uv, uint view_index)
+{
+	// Skip the DIM/STRETCH/OFFSET stuff and go directly to values
+	vec4 values = vec4(uv, -1, 1);
+
+	// From uv to tan angle (tangent space).
+	values.xy = values.xy * ubo.pre_transform[view_index].zw + ubo.pre_transform[view_index].xy;
+	values.y = -values.y; // Flip to OpenXR coordinate system.
+
+	vec3 direction = normalize(values.xyz);
+	return direction;
+}
+
+vec4 do_quad(uint view_index, vec2 view_uv, uint layer)
+{
+	uint source_image_index = ubo.images_samplers[layer][view_index].x;
+
+	// center point of the plane in view space.
+	vec3 quad_position = ubo.quad_position[layer][view_index].xyz;
+
+	// normal vector of the plane.
+	vec3 normal = ubo.quad_normal[layer][view_index].xyz;
+	normal = normalize(normal);
+
+	// coordinate system is the view space, therefore the camera/eye position is in the origin.
+	vec3 camera = vec3(0.0, 0.0, 0.0);
+
+	// default color white should never be visible
+	vec4 colour = vec4(1.0, 1.0, 1.0, 1.0);
+
+	//! @todo can we get better "pixel stuck" on projection layers with timewarp uv?
+	// never use the timewarp uv here because it depends on the projection layer pose
+	vec2 uv = view_uv;
+
+	/*
+	 * To fill in the view_uv texel on the target texture, an imaginary ray is shot through texels on the target
+	 * texture. When this imaginary ray hits a quad layer, the colour at the hit intersection is picked for the
+	 * current view_uv texel, so that the final image as seen through the headset shows this view_uv texel at the
+	 * respective location.
+	 */
+	vec3 direction = get_direction(uv, view_index);
+	direction = normalize(direction);
+
+	float denominator = dot(direction, normal);
+
+	// denominator is negative when vectors point towards each other, 0 when perpendicular,
+	// and positive when vectors point in a similar direction, i.e. direction vector faces quad backface, which we don't render.
+	if (denominator < 0.00001) {
+		// shortest distance between origin and plane defined by normal + quad_position
+		float dist = dot(camera - quad_position, normal);
+
+		// distance between origin and intersection point on the plane.
+		float intersection_dist = (dot(camera, normal) + dist) / -denominator;
+
+		// layer is behind camera as defined by direction vector
+		if (intersection_dist < 0) {
+			colour = vec4(0.0, 0.0, 0.0, 0.0);
+			return colour;
+		}
+
+		vec3 intersection = camera + intersection_dist * direction;
+
+		// ps for "plane space"
+		vec2 intersection_ps = (ubo.inverse_quad_transform[layer][view_index] * vec4(intersection.xyz, 1.0)).xy;
+
+		bool in_plane_bounds =
+			intersection_ps.x >= - ubo.quad_extent[layer].x / 2. && //
+			intersection_ps.x <= ubo.quad_extent[layer].x / 2. && //
+			intersection_ps.y >= - ubo.quad_extent[layer].y / 2. && //
+			intersection_ps.y <= ubo.quad_extent[layer].y / 2.;
+
+		if (in_plane_bounds) {
+			// intersection_ps is in [-quad_extent/2 .. quad_extent/2]. Shift to [0 .. quad_extent], then scale to [0 .. 1] for sampling.
+			vec2 plane_uv = (intersection_ps.xy + ubo.quad_extent[layer] / 2.) / ubo.quad_extent[layer];
+
+			// sample on the desired subimage, not the entire texture
+			plane_uv = plane_uv * ubo.post_transform[layer][view_index].zw + ubo.post_transform[layer][view_index].xy;
+
+			colour = texture(source[source_image_index], plane_uv);
+		} else {
+			// intersection on infinite plane outside of plane bounds
+			colour = vec4(0.0, 0.0, 0.0, 0.0);
+			return colour;
+		}
+	} else {
+		// no intersection with front face of infinite plane or perpendicular
+		colour = vec4(0.0, 0.0, 0.0, 0.0);
+		return colour;
+	}
+
+	return vec4(colour);
+}
+
+vec4 do_layers(vec2 view_uv, uint view_index)
+{
+	vec4 accum = vec4(0, 0, 0, 0);
+	for (uint layer = 0; layer < COMP_MAX_LAYERS; layer++) {
+		bool use_layer = false;
+
+		vec4 rgba = vec4(0, 0, 0, 0);
+		switch (ubo.layer_type_and_unpremultiplied[layer].x) {
+			case XRT_LAYER_STEREO_PROJECTION:
+			case XRT_LAYER_STEREO_PROJECTION_DEPTH:
+				rgba = do_projection(view_index, view_uv, layer);
+				use_layer = true;
+				break;
+			case XRT_LAYER_QUAD:
+				rgba = do_quad(view_index, view_uv, layer);
+				use_layer = true;
+				break;
+			default: break;
+		}
+
+		if (use_layer) {
+			if (ubo.layer_type_and_unpremultiplied[layer].y != 0) {
+				// Unpremultiplied blend factor of src.a.
+				accum.rgb = mix(accum.rgb, rgba.rgb, rgba.a);
+			} else {
+				// Premultiplied blend factor of 1.
+				accum.rgb = (accum.rgb * (1 - rgba.a)) + rgba.rgb;
+			}
+		}
+	}
+	return accum;
+}
+
+void main()
+{
+	uint ix = gl_GlobalInvocationID.x;
+	uint iy = gl_GlobalInvocationID.y;
+	uint iz = gl_GlobalInvocationID.z;
+
+	ivec2 offset = ivec2(ubo.views[iz].xy);
+	ivec2 extent = ivec2(ubo.views[iz].zw);
+
+	if (ix >= extent.x || iy >= extent.y) {
+		return;
+	}
+
+	vec2 view_uv = position_to_view_uv(extent, ix, iy);
+
+	vec4 colour = do_layers(view_uv, iz);
+
+	if (do_color_correction) {
+		// Do colour correction here since no automatic conversion is available in hardware.
+		colour = vec4(from_linear_to_srgb(colour.rgb), 1);
+	}
+
+	imageStore(target, ivec2(offset.x + ix, offset.y + iy), colour);
+}