diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 42ff482a..f9b948c3 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -3,7 +3,9 @@ set(SHADER_FILES detile_m8x1.comp - detile_m8x4.comp + detile_m32x1.comp + detile_m32x2.comp + detile_m32x4.comp ) set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) diff --git a/src/video_core/host_shaders/detile_m8x4.comp b/src/video_core/host_shaders/detile_m32x1.comp similarity index 69% rename from src/video_core/host_shaders/detile_m8x4.comp rename to src/video_core/host_shaders/detile_m32x1.comp index 25f7fef6..f3e84c75 100644 --- a/src/video_core/host_shaders/detile_m8x4.comp +++ b/src/video_core/host_shaders/detile_m32x1.comp @@ -8,14 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; layout(std430, binding = 0) buffer input_buf { uint in_data[]; }; -layout(rgba8ui, binding = 1) uniform writeonly uimage2D output_img; +layout(r32ui, binding = 1) uniform writeonly uimage2D output_img; layout(push_constant) uniform image_info { uint pitch; } info; // Inverse morton LUT, small enough to fit into K$ -uint lut_8x4[16] = { +uint rmort[16] = { 0x11011000, 0x31213020, 0x13031202, 0x33233222, 0x51415040, 0x71617060, @@ -27,17 +27,17 @@ uint lut_8x4[16] = { 0x57475646, 0x77677666, }; -#define MICRO_TILE_DIM 8 -#define TEXELS_PER_ELEMENT 1 +#define MICRO_TILE_DIM (8) +#define TEXELS_PER_ELEMENT (1) void main() { - uint src_tx = in_data[gl_GlobalInvocationID.x]; - uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); - uint packed_pos = lut_8x4[gl_LocalInvocationID.x >> 2] >> bit_ofs; + uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; uint col = bitfieldExtract(packed_pos, 4, 4); uint row = bitfieldExtract(packed_pos, 0, 4); + uint p0 = in_data[gl_GlobalInvocationID.x]; + uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM) uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; @@ -46,12 +46,5 @@ void main() { uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); - - uvec4 dst_tx = uvec4( - bitfieldExtract(src_tx, 0, 8), - bitfieldExtract(src_tx, 8, 8), - bitfieldExtract(src_tx, 16, 8), - bitfieldExtract(src_tx, 24, 8) - ); - imageStore(output_img, img_pos, dst_tx); + imageStore(output_img, img_pos, uvec4(p0, 0, 0, 0)); } \ No newline at end of file diff --git a/src/video_core/host_shaders/detile_m32x2.comp b/src/video_core/host_shaders/detile_m32x2.comp new file mode 100644 index 00000000..2853f8b7 --- /dev/null +++ b/src/video_core/host_shaders/detile_m32x2.comp @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 + +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint in_data[]; +}; +layout(rg32ui, binding = 1) uniform writeonly uimage2D output_img; + +layout(push_constant) uniform image_info { + uint pitch; +} info; + +// Inverse morton LUT, small enough to fit into K$ +uint rmort[16] = { + 0x11011000, 0x31213020, + 0x13031202, 0x33233222, + 0x51415040, 0x71617060, + 0x53435242, 0x73637262, + + 0x15051404, 0x35253424, + 0x17071606, 0x37273626, + 0x55455444, 0x75657464, + 0x57475646, 0x77677666, +}; + +#define MICRO_TILE_DIM (8) + +void main() { + uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); + uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; + uint col = bitfieldExtract(packed_pos, 4, 4); + uint row = bitfieldExtract(packed_pos, 0, 4); + + uint block_ofs = 2 * gl_GlobalInvocationID.x; + uint p0 = in_data[block_ofs + 0]; + uint p1 = in_data[block_ofs + 1]; + + uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4 + ivec2 img_pos = MICRO_TILE_DIM * ivec2( + gl_WorkGroupID.x % tiles_per_pitch, + gl_WorkGroupID.x / tiles_per_pitch + ); + imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, 0, 0)); +} \ No newline at end of file diff --git a/src/video_core/host_shaders/detile_m32x4.comp b/src/video_core/host_shaders/detile_m32x4.comp new file mode 100644 index 00000000..64f34e6f --- /dev/null +++ b/src/video_core/host_shaders/detile_m32x4.comp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 + +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint in_data[]; +}; +layout(rgba32ui, binding = 1) uniform writeonly uimage2D output_img; + +layout(push_constant) uniform image_info { + uint pitch; +} info; + +// Inverse morton LUT, small enough to fit into K$ +uint rmort[16] = { + 0x11011000, 0x31213020, + 0x13031202, 0x33233222, + 0x51415040, 0x71617060, + 0x53435242, 0x73637262, + + 0x15051404, 0x35253424, + 0x17071606, 0x37273626, + 0x55455444, 0x75657464, + 0x57475646, 0x77677666, +}; + +#define MICRO_TILE_DIM (8) + +void main() { + uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); + uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; + uint col = bitfieldExtract(packed_pos, 4, 4); + uint row = bitfieldExtract(packed_pos, 0, 4); + + uint block_ofs = 4 * gl_GlobalInvocationID.x; + uint p0 = in_data[block_ofs + 0]; + uint p1 = in_data[block_ofs + 1]; + uint p2 = in_data[block_ofs + 2]; + uint p3 = in_data[block_ofs + 3]; + + uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4 + ivec2 img_pos = MICRO_TILE_DIM * ivec2( + gl_WorkGroupID.x % tiles_per_pitch, + gl_WorkGroupID.x / tiles_per_pitch + ); + imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, p2, p3)); +} \ No newline at end of file diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index fc11e6cb..b464f3d7 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -33,8 +33,32 @@ static vk::Format ConvertPixelFormat(const VideoOutFormat format) { return {}; } -static bool IsDepthStencilFormat(vk::Format format) { - switch (format) { +bool ImageInfo::IsBlockCoded() const { + switch (pixel_format) { + case vk::Format::eBc1RgbaSrgbBlock: + case vk::Format::eBc1RgbaUnormBlock: + case vk::Format::eBc1RgbSrgbBlock: + case vk::Format::eBc1RgbUnormBlock: + case vk::Format::eBc2SrgbBlock: + case vk::Format::eBc2UnormBlock: + case vk::Format::eBc3SrgbBlock: + case vk::Format::eBc3UnormBlock: + case vk::Format::eBc4SnormBlock: + case vk::Format::eBc4UnormBlock: + case vk::Format::eBc5SnormBlock: + case vk::Format::eBc5UnormBlock: + case vk::Format::eBc6HSfloatBlock: + case vk::Format::eBc6HUfloatBlock: + case vk::Format::eBc7SrgbBlock: + case vk::Format::eBc7UnormBlock: + return true; + default: + return false; + } +} + +bool ImageInfo::IsDepthStencil() const { + switch (pixel_format) { case vk::Format::eD16Unorm: case vk::Format::eD16UnormS8Uint: case vk::Format::eD32Sfloat: @@ -45,17 +69,20 @@ static bool IsDepthStencilFormat(vk::Format format) { } } -static vk::ImageUsageFlags ImageUsageFlags(const vk::Format format) { +static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; - if (IsDepthStencilFormat(format)) { + if (info.IsDepthStencil()) { usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment; } else { - if (format != vk::Format::eBc3SrgbBlock) { + if (!info.IsBlockCoded()) { usage |= vk::ImageUsageFlagBits::eColorAttachment; } } + if (info.is_tiled || info.is_storage) { + usage |= vk::ImageUsageFlagBits::eStorage; + } return usage; } @@ -179,15 +206,12 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, } if (info.is_tiled) { flags |= vk::ImageCreateFlagBits::eExtendedUsage; - if (false) { // IsBlockCodedFormat() + if (info.IsBlockCoded()) { flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible; } } - info.usage = ImageUsageFlags(info.pixel_format); - if (info.is_tiled || info.is_storage) { - info.usage |= vk::ImageUsageFlagBits::eStorage; - } + info.usage = ImageUsageFlags(info); if (info.pixel_format == vk::Format::eD32Sfloat) { aspect_mask = vk::ImageAspectFlagBits::eDepth; diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 64bcfbd3..b42a2bb1 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -42,6 +42,9 @@ struct ImageInfo { const AmdGpu::Liverpool::CbDbExtent& hint = {}) noexcept; explicit ImageInfo(const AmdGpu::Image& image) noexcept; + bool IsBlockCoded() const; + bool IsDepthStencil() const; + bool is_tiled = false; bool is_storage = false; vk::Format pixel_format = vk::Format::eUndefined; diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 52fb28a4..1d3e5e21 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -84,7 +84,7 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info .baseMipLevel = 0U, .levelCount = 1, .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, + .layerCount = image.info.IsBlockCoded() ? 1 : VK_REMAINING_ARRAY_LAYERS, }, }; image_view = instance.GetDevice().createImageViewUnique(image_view_ci); diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 3fff9c11..36e1d1e1 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -8,8 +8,10 @@ #include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" +#include "video_core/host_shaders/detile_m32x1_comp.h" +#include "video_core/host_shaders/detile_m32x2_comp.h" +#include "video_core/host_shaders/detile_m32x4_comp.h" #include "video_core/host_shaders/detile_m8x1_comp.h" -#include "video_core/host_shaders/detile_m8x4_comp.h" #include #include @@ -174,27 +176,39 @@ void ConvertTileToLinear(u8* dst, const u8* src, u32 width, u32 height, bool is_ vk::Format DemoteImageFormatForDetiling(vk::Format format) { switch (format) { - case vk::Format::eB8G8R8A8Srgb: - case vk::Format::eR8G8B8A8Unorm: - return vk::Format::eR8G8B8A8Uint; case vk::Format::eR8Unorm: return vk::Format::eR8Uint; + case vk::Format::eB8G8R8A8Srgb: + [[fallthrough]]; + case vk::Format::eR8G8B8A8Unorm: + return vk::Format::eR32Uint; + case vk::Format::eBc1RgbaUnormBlock: + return vk::Format::eR32G32Uint; + case vk::Format::eBc3SrgbBlock: + [[fallthrough]]; + case vk::Format::eBc3UnormBlock: + return vk::Format::eR32G32B32A32Uint; default: - LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format)); break; } + LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format)); return format; } const DetilerContext* TileManager::GetDetiler(const Image& image) const { const auto format = DemoteImageFormatForDetiling(image.info.pixel_format); - if (image.info.tiling_mode == AmdGpu::TilingMode::Texture_MicroTiled) { + if (image.info.tiling_mode == AmdGpu::TilingMode::Texture_MicroTiled || + image.info.tiling_mode == AmdGpu::TilingMode::Depth_MicroTiled) { switch (format) { case vk::Format::eR8Uint: return &detilers[DetilerType::Micro8x1]; - case vk::Format::eR8G8B8A8Uint: - return &detilers[DetilerType::Micro8x4]; + case vk::Format::eR32Uint: + return &detilers[DetilerType::Micro32x1]; + case vk::Format::eR32G32Uint: + return &detilers[DetilerType::Micro32x2]; + case vk::Format::eR32G32B32A32Uint: + return &detilers[DetilerType::Micro32x4]; default: return nullptr; } @@ -211,7 +225,9 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc static const std::array detiler_shaders{ HostShaders::DETILE_M8X1_COMP, - HostShaders::DETILE_M8X4_COMP, + HostShaders::DETILE_M32X1_COMP, + HostShaders::DETILE_M32X2_COMP, + HostShaders::DETILE_M32X4_COMP, }; for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) { diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h index c630004c..3a74de22 100644 --- a/src/video_core/texture_cache/tile_manager.h +++ b/src/video_core/texture_cache/tile_manager.h @@ -19,7 +19,9 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format); enum DetilerType : u32 { Micro8x1, - Micro8x4, + Micro32x1, + Micro32x2, + Micro32x4, Max };