diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index 8c3122b2..1fb129f6 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -80,6 +80,7 @@ void CFG::EmitLabels() { if (inst.IsUnconditionalBranch()) { const u32 target = inst.BranchTarget(pc); AddLabel(target); + AddLabel(pc + inst.length); } else if (inst.IsConditionalBranch()) { const u32 true_label = inst.BranchTarget(pc); const u32 false_label = pc + inst.length; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 8db2d63c..820903ab 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -573,21 +573,21 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spansrc_sel == DmaDataSrc::Memory && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, - dma_data->SrcAddress(), - dma_data->NumBytes(), true); + rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true, false); } else if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Memory) { rasterizer->InlineData(dma_data->DstAddress(), &dma_data->data, sizeof(u32), false); } else if (dma_data->src_sel == DmaDataSrc::Gds && dma_data->dst_sel == DmaDataDst::Memory) { - // LOG_WARNING(Render_Vulkan, "GDS memory read"); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->src_addr_lo, + dma_data->NumBytes(), false, true); } else if (dma_data->src_sel == DmaDataSrc::Memory && dma_data->dst_sel == DmaDataDst::Memory) { - rasterizer->InlineData(dma_data->DstAddress(), - dma_data->SrcAddress(), - dma_data->NumBytes(), false); + rasterizer->CopyBuffer(dma_data->DstAddress(), + dma_data->SrcAddress(), dma_data->NumBytes(), + false, false); } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", 
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); @@ -731,20 +731,20 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32), true); } else if (dma_data->src_sel == DmaDataSrc::Memory && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress(), - dma_data->NumBytes(), true); + rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true, false); } else if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Memory) { rasterizer->InlineData(dma_data->DstAddress(), &dma_data->data, sizeof(u32), false); } else if (dma_data->src_sel == DmaDataSrc::Gds && dma_data->dst_sel == DmaDataDst::Memory) { - // LOG_WARNING(Render_Vulkan, "GDS memory read"); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->src_addr_lo, + dma_data->NumBytes(), false, true); } else if (dma_data->src_sel == DmaDataSrc::Memory && dma_data->dst_sel == DmaDataDst::Memory) { - rasterizer->InlineData(dma_data->DstAddress(), - dma_data->SrcAddress(), dma_data->NumBytes(), - false); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->SrcAddress(), + dma_data->NumBytes(), false, false); } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index e9fc0649..31b2a2c5 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -312,8 +312,23 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo const BufferId buffer_id = FindBuffer(address, num_bytes); return &slot_buffers[buffer_id]; }(); - const vk::BufferMemoryBarrier2 buf_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + const 
vk::BufferMemoryBarrier2 buf_barrier_before = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer->Handle(), + .offset = buffer->Offset(address), + .size = num_bytes, + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &buf_barrier_before, + }); + cmdbuf.updateBuffer(buffer->Handle(), buffer->Offset(address), num_bytes, value); + const vk::BufferMemoryBarrier2 buf_barrier_after = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, @@ -324,9 +339,96 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &buf_barrier, + .pBufferMemoryBarriers = &buf_barrier_after, + }); +} + +void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { + if (!dst_gds && !IsRegionRegistered(dst, num_bytes)) { + if (!src_gds && !IsRegionRegistered(src, num_bytes)) { + // Both buffers were not transferred to GPU yet. Can safely copy in host memory. 
+ memcpy(std::bit_cast<void*>(dst), std::bit_cast<const void*>(src), num_bytes); + return; + } + // Without a readback there's nothing we can do with this + // Fallback to creating dst buffer on GPU to at least have this data there + } + if (!src_gds && !IsRegionRegistered(src, num_bytes)) { + InlineData(dst, std::bit_cast<const void*>(src), num_bytes, dst_gds); + return; + } + auto& src_buffer = [&] -> const Buffer& { + if (src_gds) { + return gds_buffer; + } + const BufferId buffer_id = FindBuffer(src, num_bytes); + return slot_buffers[buffer_id]; + }(); + auto& dst_buffer = [&] -> const Buffer& { + if (dst_gds) { + return gds_buffer; + } + const BufferId buffer_id = FindBuffer(dst, num_bytes); + return slot_buffers[buffer_id]; + }(); + vk::BufferCopy region{ + .srcOffset = src_buffer.Offset(src), + .dstOffset = dst_buffer.Offset(dst), + .size = num_bytes, + }; + const vk::BufferMemoryBarrier2 buf_barriers_before[2] = { + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = dst_buffer.Handle(), + .offset = dst_buffer.Offset(dst), + .size = num_bytes, + }, + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferRead, + .buffer = src_buffer.Handle(), + .offset = src_buffer.Offset(src), + .size = num_bytes, + }, + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 2, + .pBufferMemoryBarriers = buf_barriers_before, + }); + cmdbuf.copyBuffer(src_buffer.Handle(), dst_buffer.Handle(), region); + const vk::BufferMemoryBarrier2 buf_barriers_after[2] = { + { + .srcStageMask = 
vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + .buffer = dst_buffer.Handle(), + .offset = dst_buffer.Offset(dst), + .size = num_bytes, + }, + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eTransferRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .buffer = src_buffer.Handle(), + .offset = src_buffer.Offset(src), + .size = num_bytes, + }, + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 2, + .pBufferMemoryBarriers = buf_barriers_after, }); - cmdbuf.updateBuffer(buffer->Handle(), buf_barrier.offset, num_bytes, value); } std::pair BufferCache::ObtainHostUBO(std::span data) { @@ -701,8 +803,22 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, scheduler.EndRendering(); image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); const auto cmdbuf = scheduler.CommandBuffer(); + static constexpr vk::MemoryBarrier READ_BARRIER{ + .srcAccessMask = vk::AccessFlagBits::eMemoryWrite, + .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite, + }; + static constexpr vk::MemoryBarrier WRITE_BARRIER{ + .srcAccessMask = vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, + }; + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, + vk::PipelineStageFlagBits::eTransfer, + vk::DependencyFlagBits::eByRegion, READ_BARRIER, {}, {}); cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer, copies); + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eAllCommands, 
+ vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {}); } return true; } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index e6291341..4c57e9c2 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -87,6 +87,7 @@ public: /// Writes a value to GPU buffer. void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); [[nodiscard]] std::pair ObtainHostUBO(std::span data); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index fef4c7ec..9e9b40ca 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -840,6 +840,10 @@ void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, boo buffer_cache.InlineData(address, value, num_bytes, is_gds); } +void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { + buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds); +} + u32 Rasterizer::ReadDataFromGds(u32 gds_offset) { auto* gds_buf = buffer_cache.GetGdsBuffer(); u32 value; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index ec1b5e13..b5bead69 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -53,6 +53,7 @@ public: void ScopedMarkerInsertColor(const std::string_view& str, const u32 color); void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); u32 ReadDataFromGds(u32 gsd_offset); bool InvalidateMemory(VAddr addr, u64 size); bool IsMapped(VAddr addr, u64 size);