From 440a60a43bf113b5965f6a9aa56d6ff39bbe87c5 Mon Sep 17 00:00:00 2001
From: psucien <bad_cast@protonmail.com>
Date: Wed, 5 Jun 2024 16:14:22 +0200
Subject: [PATCH] texture_cache: detiler: m8x1 and m8x4 shaders

---
 src/video_core/host_shaders/detile_m8x1.comp  | 48 +++++++++++++++
 src/video_core/host_shaders/detile_m8x4.comp  | 58 +++++++++++++++++++
 src/video_core/texture_cache/tile_manager.cpp | 26 ++++-----
 3 files changed, 118 insertions(+), 14 deletions(-)
 create mode 100644 src/video_core/host_shaders/detile_m8x1.comp
 create mode 100644 src/video_core/host_shaders/detile_m8x4.comp

diff --git a/src/video_core/host_shaders/detile_m8x1.comp b/src/video_core/host_shaders/detile_m8x1.comp
new file mode 100644
index 00000000..1b84b402
--- /dev/null
+++ b/src/video_core/host_shaders/detile_m8x1.comp
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 450
+#extension GL_KHR_shader_subgroup_shuffle : require
+
+// NOTE: Current subgroup utilization is suboptimal on most GPUs, so
+// it would be nice to process two tiles at once here.
+layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
+
+layout(std430, binding = 0) buffer input_buf {
+    uint in_data[];
+};
+layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;
+
+layout(push_constant) uniform image_info {
+    uint pitch;
+} info;
+
+#define MICRO_TILE_DIM      8
+#define TEXELS_PER_ELEMENT  4
+
+void main() { // Each invocation swaps half-words with its neighbour, then writes four 8-bit texels of one row.
+    uint p0 = in_data[gl_GlobalInvocationID.x]; // this invocation's source dword
+    uint p1 = subgroupShuffleXor(p0, 1); // p0 of the invocation whose subgroup index differs in bit 0
+    uint hword = gl_LocalInvocationID.x & 1; // 0 for even, 1 for odd local invocations
+    uint dst_tx = (hword == 1)
+        ? (p0 & 0xffff0000) | (p1 >> 16) // odd: keep own high half-word, neighbour's high half-word goes low
+        : (p0 & 0x0000ffff) | (p1 << 16); // even: keep own low half-word, neighbour's low half-word goes high
+
+    uint col = (gl_LocalInvocationID.x >> 2) & 1; // bit 2 of the local index selects the 4-texel half of the row
+    uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT) // bits 0-1 of the local index ...
+                + TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3); // ... plus 4 when bit 3 is set
+
+    uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM; // number of 8-wide tiles per pitch
+    uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; // workgroup index -> tile column
+    uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; // workgroup index -> tile row
+    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; // x of the first texel written
+    uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; // y of all four texels written
+
+    ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
+
+    #pragma unroll
+    for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) { // one store per byte of dst_tx
+        imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff)); // lowest byte lands at the smallest x
+        dst_tx >>= 8; // expose the next byte
+    }
+}
\ No newline at end of file
diff --git a/src/video_core/host_shaders/detile_m8x4.comp b/src/video_core/host_shaders/detile_m8x4.comp
new file mode 100644
index 00000000..97438fe9
--- /dev/null
+++ b/src/video_core/host_shaders/detile_m8x4.comp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 450
+#extension GL_KHR_shader_subgroup_shuffle : require
+
+layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(std430, binding = 0) buffer input_buf {
+    uint in_data[];
+};
+layout(rgba8ui, binding = 1) uniform writeonly uimage2D output_img;
+
+layout(push_constant) uniform image_info {
+    uint pitch;
+} info;
+
+// Inverse Morton LUT (small enough to fit into K$): byte k of entry j holds ((col << 4) | row) for local invocation 4*j + k
+const uint lut_8x4[16] = { // const: never written, so declare it as a compile-time constant table
+    0x11011000, 0x31213020,
+    0x13031202, 0x33233222,
+    0x51415040, 0x71617060,
+    0x53435242, 0x73637262,
+
+    0x15051404, 0x35253424,
+    0x17071606, 0x37273626,
+    0x55455444, 0x75657464,
+    0x57475646, 0x77677666,
+};
+
+#define MICRO_TILE_DIM      8
+#define TEXELS_PER_ELEMENT  1
+
+void main() { // Each invocation moves one 32-bit texel to the position the LUT gives for its local index.
+    uint src_tx = in_data[gl_GlobalInvocationID.x]; // this invocation's source dword (one rgba8 texel)
+
+    uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); // byte lane inside the LUT word
+    uint packed_pos = lut_8x4[gl_LocalInvocationID.x >> 2] >> bit_ofs; // each LUT word serves four invocations
+    uint col = bitfieldExtract(packed_pos, 4, 4); // high nibble of the selected byte
+    uint row = bitfieldExtract(packed_pos, 0, 4); // low nibble of the selected byte
+
+    uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM; // number of 8-wide tiles per pitch
+    uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; // workgroup index -> tile column
+    uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; // workgroup index -> tile row
+
+    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; // destination x in the image
+    uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; // destination y in the image
+
+    ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
+
+    uvec4 dst_tx = uvec4( // unpack the dword into four 8-bit components, lowest byte first
+        bitfieldExtract(src_tx,  0, 8),
+        bitfieldExtract(src_tx,  8, 8),
+        bitfieldExtract(src_tx, 16, 8),
+        bitfieldExtract(src_tx, 24, 8)
+    );
+    imageStore(output_img, img_pos, dst_tx);
+}
\ No newline at end of file
diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp
index d33427db..e36c40f4 100644
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@@ -4,10 +4,14 @@
 #include "boost/container/static_vector.hpp"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_shader_util.h"
 #include "video_core/texture_cache/image_view.h"
 #include "video_core/texture_cache/texture_cache.h"
 #include "video_core/texture_cache/tile_manager.h"
 
+#include "video_core/host_shaders/detile_m8x1_comp.h"
+#include "video_core/host_shaders/detile_m8x4_comp.h"
+
 #include <vulkan/vulkan_to_string.hpp>
 
 namespace VideoCore {
@@ -201,26 +205,20 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc
                          TextureCache& texture_cache, Vulkan::StreamBuffer& staging)
     : instance{instance}, scheduler{scheduler}, texture_cache{texture_cache}, staging{staging} {
 
+    static const std::array detiler_shaders{
+        HostShaders::DETILE_M8X1_COMP,
+        HostShaders::DETILE_M8X4_COMP,
+    };
+
     for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) {
         auto& ctx = detilers[pl_id];
 
-        const std::vector<u32> shader_code{};
-
-        const vk::ShaderModuleCreateInfo shader_info = {
-            .codeSize = shader_code.size(),
-            .pCode = shader_code.data(),
-        };
-
-        vk::UniqueShaderModule module;
-        try {
-            module = instance.GetDevice().createShaderModuleUnique(shader_info);
-        } catch (vk::SystemError& err) {
-            UNREACHABLE_MSG("{}", err.what());
-        }
+        const auto& module = Vulkan::Compile(
+            detiler_shaders[pl_id], vk::ShaderStageFlagBits::eCompute, instance.GetDevice());
 
         const vk::PipelineShaderStageCreateInfo shader_ci = {
             .stage = vk::ShaderStageFlagBits::eCompute,
-            .module = *module,
+            .module = module,
             .pName = "main",
         };