diff --git a/src/core/libraries/error_codes.h b/src/core/libraries/error_codes.h index e4aac622e..321a9fbaf 100644 --- a/src/core/libraries/error_codes.h +++ b/src/core/libraries/error_codes.h @@ -252,6 +252,7 @@ constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_QUEUE_ID = 0x80D17001; constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_RING_BASE_ADDR = 0x80D17003; constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_RING_SIZE = 0x80D17002; constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_READ_PTR_ADDR = 0x80D17004; +constexpr int ORBIS_GNM_ERROR_FAILURE = 0x8EEE00FF; // Generic constexpr int ORBIS_OK = 0x00000000; diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 05ae882f1..3da41ccaf 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -31,6 +31,7 @@ static constexpr bool g_fair_hw_init = false; // In case if `submitDone` is issued we need to block submissions until GPU idle static u32 submission_lock{}; +static std::mutex m_submission{}; static u64 frames_submitted{}; // frame counter struct AscQueueInfo { @@ -211,9 +212,32 @@ int PS4_SYSV_ABI sceGnmDestroyWorkloadStream() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmDingDong() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) { + LOG_INFO(Lib_GnmDriver, "vqid {}, offset_dw {}", gnm_vqid, next_offs_dw); + + if (gnm_vqid == 0) { + return; + } + + std::unique_lock lock{m_submission}; + if (submission_lock != 0) { + liverpool->WaitGpuIdle(); + + // Suspend logic goes here + + submission_lock = 0; + } + + auto vqid = gnm_vqid - 1; + auto& asc_queue = asc_queues[{vqid}]; + const auto* acb_ptr = reinterpret_cast(asc_queue.map_addr + *asc_queue.read_addr); + const auto acb_size = next_offs_dw ? (next_offs_dw << 2u) - *asc_queue.read_addr + : (asc_queue.ring_size_dw << 2u) - *asc_queue.read_addr; + + liverpool->SubmitAsc(vqid, {acb_ptr, acb_size >> 2u}); + + *asc_queue.read_addr += acb_size; + *asc_queue.read_addr %= asc_queue.ring_size_dw * 4; } int PS4_SYSV_ABI sceGnmDingDongForWorkload() { @@ -764,10 +788,12 @@ int PS4_SYSV_ABI sceGnmMapComputeQueue(u32 pipe_id, u32 queue_id, VAddr ring_bas } auto vqid = asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw); + // We need to offset index as `dingDong` assumes it to be from the range [1..64] + const auto gnm_vqid = vqid.index + 1; LOG_INFO(Lib_GnmDriver, "ASC pipe {} queue {} mapped to vqueue {}", pipe_id, queue_id, - vqid.index); + gnm_vqid); - return vqid.index; + return gnm_vqid; } int PS4_SYSV_ABI sceGnmMapComputeQueueWithPriority(u32 pipe_id, u32 queue_id, VAddr ring_base_addr, @@ -814,14 +840,16 @@ int PS4_SYSV_ABI sceGnmRegisterGnmLiveCallbackConfig() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmRegisterOwner() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +s32 PS4_SYSV_ABI sceGnmRegisterOwner(void* handle, const char* name) { + LOG_TRACE(Lib_GnmDriver, "called"); + return ORBIS_GNM_ERROR_FAILURE; // PA Debug is always disabled in retail FW } -int PS4_SYSV_ABI sceGnmRegisterResource() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +s32 PS4_SYSV_ABI sceGnmRegisterResource(void* res_handle, void* owner_handle, const void* addr, + size_t size, const char* name, int res_type, + u64 user_data) { + LOG_TRACE(Lib_GnmDriver, "called"); + return ORBIS_GNM_ERROR_FAILURE; // PA Debug is always disabled in retail FW } int PS4_SYSV_ABI sceGnmRequestFlipAndSubmitDone() { diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index ce8944f59..e37ab81b2 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -33,7 +33,7 @@ int PS4_SYSV_ABI sceGnmDebuggerWriteSqIndirectRegister(); int PS4_SYSV_ABI sceGnmDebugHardwareStatus(); s32 PS4_SYSV_ABI sceGnmDeleteEqEvent(SceKernelEqueue eq, u64 id); int PS4_SYSV_ABI sceGnmDestroyWorkloadStream(); -int PS4_SYSV_ABI sceGnmDingDong(); +void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw); int PS4_SYSV_ABI sceGnmDingDongForWorkload(); int PS4_SYSV_ABI sceGnmDisableMipStatsReport(); s32 PS4_SYSV_ABI sceGnmDispatchDirect(u32* cmdbuf, u32 size, u32 threads_x, u32 threads_y, @@ -125,8 +125,9 @@ int PS4_SYSV_ABI sceGnmQueryResourceRegistrationUserMemoryRequirements(); int PS4_SYSV_ABI sceGnmRaiseUserExceptionEvent(); int PS4_SYSV_ABI sceGnmRegisterGdsResource(); int PS4_SYSV_ABI sceGnmRegisterGnmLiveCallbackConfig(); -int PS4_SYSV_ABI sceGnmRegisterOwner(); -int PS4_SYSV_ABI sceGnmRegisterResource(); +s32 PS4_SYSV_ABI sceGnmRegisterOwner(void* handle, const char* name); +s32 PS4_SYSV_ABI sceGnmRegisterResource(void* res_handle, void* owner_handle, const void* addr, + size_t size, const char* name, int res_type, u64 user_data); int PS4_SYSV_ABI sceGnmRequestFlipAndSubmitDone(); int PS4_SYSV_ABI sceGnmRequestFlipAndSubmitDoneForWorkload(); int PS4_SYSV_ABI sceGnmRequestMipStatsReportAndReset(); diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index 90ac7260f..0c40f1c48 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -288,6 +288,8 @@ void RegisterLib(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("uquVH4-Du78", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, sceVideoOutClose); LIB_FUNCTION("1FZBKy8HeNU", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, sceVideoOutGetVblankStatus); + LIB_FUNCTION("kGVLc3htQE8", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, + sceVideoOutGetDeviceCapabilityInfo); // openOrbis appears to have libSceVideoOut_v1 module libSceVideoOut_v1.1 LIB_FUNCTION("Up36PTk687E", "libSceVideoOut", 1, "libSceVideoOut", 1, 1, sceVideoOutOpen); diff --git a/src/shader_recompiler/frontend/fetch_shader.cpp b/src/shader_recompiler/frontend/fetch_shader.cpp index 7f4f50e94..80917e0a3 100644 --- a/src/shader_recompiler/frontend/fetch_shader.cpp +++ b/src/shader_recompiler/frontend/fetch_shader.cpp @@ -72,6 +72,9 @@ std::vector ParseFetchShader(const u32* code) { attrib.sgpr_base = it->base_sgpr; attrib.dword_offset = it->dword_offset; + // Store instance id rate + attrib.instance_data = inst.src[0].code; + // Mark load as used. it->dst_reg = -1; } diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h index 2f8eae12c..14f2bf4d4 100644 --- a/src/shader_recompiler/frontend/fetch_shader.h +++ b/src/shader_recompiler/frontend/fetch_shader.h @@ -9,11 +9,12 @@ namespace Shader::Gcn { struct VertexAttribute { - u8 semantic; ///< Semantic index of the attribute - u8 dest_vgpr; ///< Destination VGPR to load first component. - u8 num_elements; ///< Number of components to load - u8 sgpr_base; ///< SGPR that contains the pointer to the list of vertex V# - u8 dword_offset; ///< The dword offset of the V# that describes this attribute. + u8 semantic; ///< Semantic index of the attribute + u8 dest_vgpr; ///< Destination VGPR to load first component. + u8 num_elements; ///< Number of components to load + u8 sgpr_base; ///< SGPR that contains the pointer to the list of vertex V# + u8 dword_offset; ///< The dword offset of the V# that describes this attribute. + u8 instance_data; ///< Indicates that the buffer will be accessed in instance rate }; std::vector ParseFetchShader(const u32* code); diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 344fdc8bb..2d1679f37 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -194,6 +194,11 @@ void Translator::EmitFetch(const GcnInst& inst) { ir.SetVectorReg(dst_reg++, ir.GetAttribute(attr, i)); } + if (attrib.instance_data == 2 || attrib.instance_data == 3) { + LOG_WARNING(Render_Recompiler, "Unsupported instance step rate = {}", + attrib.instance_data); + } + // Read the V# of the attribute to figure out component number and type. const auto buffer = info.ReadUd(attrib.sgpr_base, attrib.dword_offset); const u32 num_components = AmdGpu::NumComponents(buffer.data_format); @@ -203,6 +208,7 @@ void Translator::EmitFetch(const GcnInst& inst) { .num_components = std::min(attrib.num_elements, num_components), .sgpr_base = attrib.sgpr_base, .dword_offset = attrib.dword_offset, + .instance_step_rate = static_cast(attrib.instance_data), }); } } diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index ca489114d..a1f599ba5 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -270,8 +270,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip u32 image_binding = descriptors.Add(ImageResource{ .sgpr_base = tsharp.sgpr_base, .dword_offset = tsharp.dword_offset, - .type = image.type, - .nfmt = static_cast(image.num_format.Value()), + .type = image.GetType(), + .nfmt = static_cast(image.GetNumberFmt()), .is_storage = IsImageStorageInstruction(inst), .is_depth = bool(inst_info.is_depth), }); @@ -293,7 +293,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip // Now that we know the image type, adjust texture coordinate vector. const IR::Inst* body = inst.Arg(1).InstRecursive(); const auto [coords, arg] = [&] -> std::pair { - switch (image.type) { + switch (image.GetType()) { case AmdGpu::ImageType::Color1D: return {body->Arg(0), body->Arg(1)}; case AmdGpu::ImageType::Color1DArray: @@ -305,7 +305,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip case AmdGpu::ImageType::Cube: return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)}; default: - UNREACHABLE_MSG("Unknown image type {}", image.type.Value()); + UNREACHABLE_MSG("Unknown image type {}", image.GetType()); } }(); inst.SetArg(1, coords); diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 634749585..196fad013 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -72,11 +72,19 @@ using SamplerResourceList = boost::container::static_vector; struct Info { struct VsInput { + enum InstanceIdType : u8 { + None = 0, + OverStepRate0 = 1, + OverStepRate1 = 2, + Plain = 3, + }; + AmdGpu::NumberFormat fmt; u16 binding; u16 num_components; u8 sgpr_base; u8 dword_offset; + InstanceIdType instance_step_rate; }; boost::container::static_vector vs_inputs{}; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 956e65a18..38d274100 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -12,7 +12,7 @@ namespace AmdGpu { static const char* dcb_task_name{"DCB_TASK"}; static const char* ccb_task_name{"CCB_TASK"}; -static const char* asc_task_name{"ACB_TASK"}; +static const char* acb_task_name{"ACB_TASK"}; std::array Liverpool::ConstantEngine::constants_heap; @@ -381,6 +381,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span acb) { + TracyFiberEnter(acb_task_name); + while (!acb.empty()) { const auto* header = reinterpret_cast(acb.data()); const u32 type = header->type; @@ -393,6 +395,69 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb) { const PM4ItOpcode opcode = header->type3.opcode; const auto* it_body = reinterpret_cast(header) + 1; switch (opcode) { + case PM4ItOpcode::Nop: { + const auto* nop = reinterpret_cast(header); + break; + } + case PM4ItOpcode::IndirectBuffer: { + const auto* indirect_buffer = reinterpret_cast(header); + auto task = + ProcessCompute({indirect_buffer->Address(), indirect_buffer->ib_size}); + while (!task.handle.done()) { + task.handle.resume(); + + TracyFiberLeave; + co_yield {}; + TracyFiberEnter(acb_task_name); + }; + break; + } + case PM4ItOpcode::AcquireMem: { + break; + } + case PM4ItOpcode::SetShReg: { + const auto* set_data = reinterpret_cast(header); + std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, + (count - 1) * sizeof(u32)); + break; + } + case PM4ItOpcode::DispatchDirect: { + const auto* dispatch_direct = reinterpret_cast(header); + regs.cs_program.dim_x = dispatch_direct->dim_x; + regs.cs_program.dim_y = dispatch_direct->dim_y; + regs.cs_program.dim_z = dispatch_direct->dim_z; + regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; + if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + rasterizer->DispatchDirect(); + } + break; + } + case PM4ItOpcode::WriteData: { + const auto* write_data = reinterpret_cast(header); + ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5); + const u32 data_size = (header->type3.count.Value() - 2) * 4; + if (!write_data->wr_one_addr.Value()) { + std::memcpy(write_data->Address(), write_data->data, data_size); + } else { + UNREACHABLE(); + } + break; + } + case PM4ItOpcode::WaitRegMem: { + const auto* wait_reg_mem = reinterpret_cast(header); + ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); + while (!wait_reg_mem->Test()) { + TracyFiberLeave; + co_yield {}; + TracyFiberEnter(acb_task_name); + } + break; + } + case PM4ItOpcode::ReleaseMem: { + const auto* release_mem = reinterpret_cast(header); + release_mem->SignalFence(Platform::InterruptId::Compute0RelMem); // <--- + break; + } default: UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}", static_cast(opcode), count); @@ -401,7 +466,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb) { acb = acb.subspan(header->type3.NumWords() + 1); } - return {}; // Not a coroutine yet + TracyFiberLeave; } void Liverpool::SubmitGfx(std::span dcb, std::span ccb) { diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 4883834f7..a3f6cf639 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -265,6 +265,7 @@ enum class InterruptSelect : u32 { None = 0, IrqOnly = 1, IrqWhenWriteConfirm = 2, + IrqUndocumented = 3, }; struct PM4CmdEventWriteEop { @@ -299,6 +300,9 @@ struct PM4CmdEventWriteEop { void SignalFence() const { switch (data_sel.Value()) { + case DataSelect::None: { + break; + } case DataSelect::Data32Low: { *Address() = DataDWord(); break; @@ -321,6 +325,9 @@ struct PM4CmdEventWriteEop { // No interrupt break; } + case InterruptSelect::IrqOnly: + ASSERT(data_sel == DataSelect::None); + [[fallthrough]]; case InterruptSelect::IrqWhenWriteConfirm: { Platform::IrqC::Instance()->Signal(Platform::InterruptId::GfxEop); break; @@ -559,4 +566,105 @@ struct PM4CmdDrawIndexBase { u32 addr_hi; }; +struct PM4CmdIndirectBuffer { + PM4Type3Header header; + u32 ibase_lo; ///< Indirect buffer base address, must be 4 byte aligned + union { + BitField<0, 16, u32> ibase_hi; ///< Indirect buffer base address + u32 dw1; + }; + union { + BitField<0, 20, u32> ib_size; ///< Indirect buffer size + BitField<20, 1, u32> chain; ///< set to chain to IB allocations + BitField<24, 8, u32> vmid; ///< Virtual memory domain ID for command buffer + u32 dw2; + }; + + template + T* Address() const { + return reinterpret_cast((u64(ibase_hi) << 32u) | ibase_lo); + } +}; + +struct PM4CmdReleaseMem { + PM4Type3Header header; + union { + BitField<0, 6, u32> event_type; ///< Event type written to VGT_EVENT_INITIATOR + BitField<8, 4, u32> event_index; ///< Event index + BitField<12, 1, u32> tcl1_vol_action_ena; + BitField<13, 1, u32> tc_vol_action_ena; + BitField<15, 1, u32> tc_wb_action_ena; + BitField<16, 1, u32> tcl1__action_ena; + BitField<17, 1, u32> tc_action_ena; + BitField<25, 2, u32> cache_policy; ///< Cache Policy setting used for writing fences and + ///< timestamps to the TCL2 + u32 dw1; + }; + union { + BitField<16, 2, u32> dst_sel; ///< destination select + BitField<24, 3, InterruptSelect> int_sel; ///< selects interrupt action for end-of-pipe + BitField<29, 3, DataSelect> data_sel; ///< selects source of data + u32 dw2; + }; + u32 address_lo; ///< low bits of address + u32 address_hi; ///< high bits of address + union { + struct { + u16 gds_index; ///< Byte offset into GDS to copy from + u16 num_dw; ///< Number of DWORDS of GDS to copy + }; + u32 data_lo; ///< value that will be written to memory when event occurs + }; + u32 data_hi; + + template + T* Address() const { + return reinterpret_cast(address_lo | u64(address_hi) << 32); + } + + u32 DataDWord() const { + return data_lo; + } + + u64 DataQWord() const { + return data_lo | u64(data_hi) << 32; + } + + void SignalFence(Platform::InterruptId irq_id) const { + switch (data_sel.Value()) { + case DataSelect::Data32Low: { + *Address() = DataDWord(); + break; + } + case DataSelect::Data64: { + *Address() = DataQWord(); + break; + } + case DataSelect::PerfCounter: { + *Address() = Common::FencedRDTSC(); + break; + } + default: { + UNREACHABLE(); + } + } + + switch (int_sel.Value()) { + case InterruptSelect::None: { + // No interrupt + break; + } + case InterruptSelect::IrqUndocumented: + [[fallthrough]]; + case InterruptSelect::IrqWhenWriteConfirm: { + Platform::IrqC::Instance()->Signal(irq_id); + break; + } + default: { + UNREACHABLE(); + } + } + } +}; + } // namespace AmdGpu diff --git a/src/video_core/amdgpu/pm4_opcodes.h b/src/video_core/amdgpu/pm4_opcodes.h index 1d2ab4316..8922c4ea3 100644 --- a/src/video_core/amdgpu/pm4_opcodes.h +++ b/src/video_core/amdgpu/pm4_opcodes.h @@ -46,6 +46,7 @@ enum class PM4ItOpcode : u32 { EventWrite = 0x46, EventWriteEop = 0x47, EventWriteEos = 0x48, + ReleaseMem = 0x49, PremableCntl = 0x4A, DmaData = 0x50, ContextRegRmw = 0x51, diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index cb15080f3..f464d95ab 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -108,36 +108,39 @@ constexpr std::string_view NameOf(TilingMode type) { } struct Image { - union { - BitField<0, 38, u64> base_address; - BitField<40, 12, u64> min_lod; - BitField<52, 6, u64> data_format; - BitField<58, 4, u64> num_format; - BitField<62, 2, u64> mtype; - }; - union { - BitField<0, 14, u64> width; - BitField<14, 14, u64> height; - BitField<28, 3, u64> perf_modulation; - BitField<31, 1, u64> interlaced; - BitField<32, 3, u64> dst_sel_x; - BitField<35, 3, u64> dst_sel_y; - BitField<38, 3, u64> dst_sel_z; - BitField<41, 3, u64> dst_sel_w; - BitField<44, 4, u64> base_level; - BitField<48, 4, u64> last_level; - BitField<52, 5, u64> tiling_index; - BitField<57, 1, u64> pow2pad; - BitField<58, 1, u64> mtype2; - BitField<59, 1, u64> atc; - BitField<60, 4, ImageType> type; - }; - union { - BitField<0, 13, u64> depth; - BitField<13, 14, u64> pitch; - BitField<32, 13, u64> base_array; - BitField<45, 13, u64> last_array; - }; + u64 base_address : 38; + u64 mtype_l2 : 2; + u64 min_lod : 12; + u64 data_format : 6; + u64 num_format : 4; + u64 mtype : 2; + + u64 width : 14; + u64 height : 14; + u64 perf_modulation : 3; + u64 interlaced : 1; + u64 dst_sel_x : 3; + u64 dst_sel_y : 3; + u64 dst_sel_z : 3; + u64 dst_sel_w : 3; + u64 base_level : 4; + u64 last_level : 4; + u64 tiling_index : 5; + u64 pow2pad : 1; + u64 mtype2 : 1; + u64 atc : 1; + u64 type : 4; + + u64 depth : 13; + u64 pitch : 14; + u64 : 5; + u64 base_array : 13; + u64 last_array : 13; + u64 : 6; + u64 min_lod_warn : 12; + u64 counter_bank_id : 8; + u64 lod_hw_cnt_en : 1; + u64 : 43; VAddr Address() const { return base_address << 8; @@ -148,8 +151,8 @@ struct Image { } u32 NumLayers() const { - u32 slices = type == ImageType::Color3D ? 1 : depth.Value() + 1; - if (type == ImageType::Cube) { + u32 slices = GetType() == ImageType::Color3D ? 1 : depth + 1; + if (GetType() == ImageType::Cube) { slices *= 6; } if (pow2pad) { @@ -159,33 +162,38 @@ struct Image { } u32 NumLevels() const { - if (type == ImageType::Color2DMsaa || type == ImageType::Color2DMsaaArray) { + if (GetType() == ImageType::Color2DMsaa || GetType() == ImageType::Color2DMsaaArray) { return 1; } return last_level + 1; } + ImageType GetType() const noexcept { + return static_cast(type); + } + DataFormat GetDataFmt() const noexcept { - return static_cast(data_format.Value()); + return static_cast(data_format); } NumberFormat GetNumberFmt() const noexcept { - return static_cast(num_format.Value()); + return static_cast(num_format); } - [[nodiscard]] TilingMode GetTilingMode() const { - return static_cast(tiling_index.Value()); + TilingMode GetTilingMode() const { + return static_cast(tiling_index); } - [[nodiscard]] bool IsTiled() const { + bool IsTiled() const { return GetTilingMode() != TilingMode::Display_Linear; } - [[nodiscard]] size_t GetSizeAligned() const { + size_t GetSizeAligned() const { // TODO: Derive this properly from tiling params - return (width + 1) * (height + 1) * NumComponents(GetDataFmt()); + return Pitch() * (height + 1) * NumComponents(GetDataFmt()); } }; +static_assert(sizeof(Image) == 32); // 256bits // 8.2.7. Image Sampler [RDNA 2 Instruction Set Architecture] enum class ClampMode : u64 { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 787fb774a..b3401ec1b 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -53,7 +53,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul bindings.push_back({ .binding = input.binding, .stride = buffer.GetStride(), - .inputRate = vk::VertexInputRate::eVertex, + .inputRate = input.instance_step_rate == Shader::Info::VsInput::None + ? vk::VertexInputRate::eVertex + : vk::VertexInputRate::eInstance, }); } @@ -402,8 +404,11 @@ void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const { // Calculate buffers memory overlaps boost::container::static_vector ranges{}; for (const auto& input : vs_info.vs_inputs) { - const auto& buffer = guest_buffers.emplace_back( - vs_info.ReadUd(input.sgpr_base, input.dword_offset)); + const auto& buffer = vs_info.ReadUd(input.sgpr_base, input.dword_offset); + if (buffer.GetSize() == 0) { + continue; + } + guest_buffers.emplace_back(buffer); ranges.emplace_back(buffer.base_address.Value(), buffer.base_address.Value() + buffer.GetSize()); } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 62ffdd1cc..e223bbaf1 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -116,11 +116,13 @@ static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group) noexcept { const auto& attrib = group.attrib; is_tiled = attrib.tiling_mode == TilingMode::Tile; + tiling_mode = + is_tiled ? AmdGpu::TilingMode::Display_MacroTiled : AmdGpu::TilingMode::Display_Linear; pixel_format = ConvertPixelFormat(attrib.pixel_format); type = vk::ImageType::e2D; size.width = attrib.width; size.height = attrib.height; - pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) >> 7; + pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) & (~127); const bool is_32bpp = attrib.pixel_format != VideoOutFormat::A16R16G16B16Float; ASSERT(is_32bpp); if (!is_tiled) { @@ -128,11 +130,11 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group) noe return; } if (Config::isNeoMode()) { - guest_size_bytes = pitch * 128 * ((size.height + 127) & (~127)) * 4; + guest_size_bytes = pitch * ((size.height + 127) & (~127)) * 4; } else { - guest_size_bytes = pitch * 128 * ((size.height + 63) & (~63)) * 4; + guest_size_bytes = pitch * ((size.height + 63) & (~63)) * 4; } - is_vo_surface = true; + usage.vo_buffer = true; } ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, @@ -140,12 +142,14 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, is_tiled = buffer.IsTiled(); tiling_mode = buffer.GetTilingMode(); pixel_format = LiverpoolToVK::SurfaceFormat(buffer.info.format, buffer.NumFormat()); + num_samples = 1 << buffer.attrib.num_fragments_log2; type = vk::ImageType::e2D; size.width = hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? hint.height : buffer.Height(); size.depth = 1; pitch = size.width; guest_size_bytes = buffer.GetSizeAligned(); + usage.render_target = true; } ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, @@ -153,18 +157,20 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, is_tiled = false; pixel_format = LiverpoolToVK::DepthFormat(buffer.z_info.format, buffer.stencil_info.format); type = vk::ImageType::e2D; + num_samples = 1 << buffer.z_info.num_samples; // spec doesn't say it is a log2 size.width = hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? hint.height : buffer.Height(); size.depth = 1; pitch = size.width; guest_size_bytes = buffer.GetSizeAligned(); + usage.depth_target = true; } ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { is_tiled = image.IsTiled(); tiling_mode = image.GetTilingMode(); pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); - type = ConvertImageType(image.type); + type = ConvertImageType(image.GetType()); size.width = image.width + 1; size.height = image.height + 1; size.depth = 1; @@ -222,7 +228,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, } } - info.usage = ImageUsageFlags(info); + usage = ImageUsageFlags(info); if (info.pixel_format == vk::Format::eD32Sfloat) { aspect_mask = vk::ImageAspectFlagBits::eDepth; @@ -243,7 +249,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, .mipLevels = static_cast(info.resources.levels), .arrayLayers = static_cast(info.resources.layers), .tiling = vk::ImageTiling::eOptimal, - .usage = info.usage, + .usage = usage, .initialLayout = vk::ImageLayout::eUndefined, }; @@ -296,6 +302,31 @@ void Image::Transit(vk::ImageLayout dst_layout, vk::Flags ds pl_stage = dst_pl_stage; } +void Image::Upload(vk::Buffer buffer, u64 offset) { + Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); + + // Copy to the image. + const vk::BufferImageCopy image_copy = { + .bufferOffset = offset, + .bufferRowLength = info.pitch, + .bufferImageHeight = info.size.height, + .imageSubresource{ + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {info.size.width, info.size.height, 1}, + }; + + const auto cmdbuf = scheduler->CommandBuffer(); + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, image_copy); + + Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); +} + Image::~Image() = default; } // namespace VideoCore diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index f54a796f1..2964379f1 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -42,18 +42,28 @@ struct ImageInfo { const AmdGpu::Liverpool::CbDbExtent& hint = {}) noexcept; explicit ImageInfo(const AmdGpu::Image& image) noexcept; + bool IsTiled() const { + return tiling_mode != AmdGpu::TilingMode::Display_Linear; + } bool IsBlockCoded() const; bool IsPacked() const; bool IsDepthStencil() const; + struct { + u32 texture : 1; + u32 storage : 1; + u32 render_target : 1; + u32 depth_target : 1; + u32 vo_buffer : 1; + } usage; // Usage data tracked during image lifetime + bool is_tiled = false; bool is_storage = false; - bool is_vo_surface = false; vk::Format pixel_format = vk::Format::eUndefined; vk::ImageType type = vk::ImageType::e1D; - vk::ImageUsageFlags usage; SubresourceExtent resources; Extent3D size{1, 1, 1}; + u32 num_samples = 1; u32 pitch = 0; u32 guest_size_bytes = 0; AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear}; @@ -117,6 +127,7 @@ struct Image { } void Transit(vk::ImageLayout dst_layout, vk::Flags dst_mask); + void Upload(vk::Buffer buffer, u64 offset); const Vulkan::Instance* instance; Vulkan::Scheduler* scheduler; @@ -131,6 +142,7 @@ struct Image { std::optional view_for_detiler; // Resource state tracking + vk::ImageUsageFlags usage; vk::Flags pl_stage = vk::PipelineStageFlagBits::eAllCommands; vk::Flags access_mask = vk::AccessFlagBits::eNone; vk::ImageLayout layout = vk::ImageLayout::eUndefined; diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 240dad7a1..7d6e2960f 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -48,7 +48,7 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) { ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept : is_storage{is_storage} { - type = ConvertImageViewType(image.type); + type = ConvertImageViewType(image.GetType()); format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); range.base.level = 0; range.base.layer = 0; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 526dfa475..441124230 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -151,7 +151,7 @@ ImageView& TextureCache::RegisterImageView(Image& image, const ImageViewInfo& vi // temporary remove its storage bit. std::optional usage_override; if (!image.info.is_storage) { - usage_override = image.info.usage & ~vk::ImageUsageFlagBits::eStorage; + usage_override = image.usage & ~vk::ImageUsageFlagBits::eStorage; } const ImageViewId view_id = slot_image_views.insert(instance, view_info, image, usage_override); @@ -183,7 +183,7 @@ ImageView& TextureCache::RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buff vk::AccessFlagBits::eColorAttachmentWrite | vk::AccessFlagBits::eColorAttachmentRead); - ImageViewInfo view_info{buffer, image.info.is_vo_surface}; + ImageViewInfo view_info{buffer, !!image.info.usage.vo_buffer}; return RegisterImageView(image, view_info); } @@ -210,26 +210,8 @@ void TextureCache::RefreshImage(Image& image) { if (!tile_manager.TryDetile(image)) { // Upload data to the staging buffer. const auto offset = staging.Copy(image.cpu_addr, image.info.guest_size_bytes, 4); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - // Copy to the image. - const vk::BufferImageCopy image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource{ - .aspectMask = vk::ImageAspectFlagBits::eColor, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1, - }, - .imageOffset = {0, 0, 0}, - .imageExtent = {image.info.size.width, image.info.size.height, 1}, - }; - - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.copyBufferToImage(staging.Handle(), image.image, - vk::ImageLayout::eTransferDstOptimal, image_copy); + image.Upload(staging.Handle(), offset); } image.Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 94f0439c4..b2ff753b8 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -15,7 +15,6 @@ #include #include -#include namespace VideoCore {