diff --git a/RSPRecomp/src/rsp_recomp.cpp b/RSPRecomp/src/rsp_recomp.cpp
index fe04dfc..e6dfdb3 100644
--- a/RSPRecomp/src/rsp_recomp.cpp
+++ b/RSPRecomp/src/rsp_recomp.cpp
@@ -1,33 +1,583 @@
+#include <vector>
+#include <array>
+#include <fstream>
+#include <unordered_map>
+#include <unordered_set>
+#include <optional>
+#include <cassert>
 #include "rabbitizer.hpp"
 #include "fmt/format.h"
 #include "fmt/ostream.h"
-int main() {
-    //rabbitizer::InstructionRsp instr{ 0xE9DD3801, 0x040013E0 }; // suv $v29[0], 0x8($14)
-    rabbitizer::InstructionRsp instr{ 0xEAF70B84, 0x04001624 }; // ssv $v23[7], 0x8($23)
-    //rabbitizer::InstructionRsp instr{ 0x4B5E888F, 0x04001414 }; // vmadh $v2, $v17, $v30[2]
-    bool has_element = false;
-    int element = 0;
+using InstrId = rabbitizer::InstrId::UniqueId;
+using Cop0Reg = rabbitizer::Registers::Rsp::Cop0;
+constexpr size_t instr_size = sizeof(uint32_t);
 
-    fmt::print("{}\n", instr.disassemble(0));
-    fmt::print("{}\n", instr.getOpcodeName());
-    fmt::print("{}\n", instr.disassembleOperands());
+// Can't use rabbitizer's operand types because we need to be able to provide a register reference or a register index
+enum class RspOperand {
+    None,
+    Vt,
+    VtIndex,
+    Vd,
+    Vs,
+    VsIndex,
+    De,
+    Rt,
+    Rs,
+    Imm7,
+};
 
-    if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementhigh)) {
-        element = instr.GetRsp_elementhigh();
-        has_element = true;
-    } else if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementlow)) {
-        if (has_element) {
-            fmt::print(stderr, "Instruction cannot have two element values {}\n", instr.disassemble(0));
-            std::exit(EXIT_FAILURE);
-        }
-        element = instr.GetRsp_elementlow();
-        has_element = true;
-    }
+std::unordered_map<InstrId, std::array<RspOperand, 3>> vector_operands{
+    // Vt, Rs, Imm
+    { InstrId::rsp_lbv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_ldv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lfv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lhv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_llv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lpv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lqv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lrv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lsv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_luv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    // { InstrId::rsp_lwv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}}, // Not in rabbitizer
+    { InstrId::rsp_sbv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_sdv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_sfv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_shv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_slv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_spv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_sqv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_srv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_ssv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_suv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_swv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_stv, {RspOperand::VtIndex, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_ltv, {RspOperand::VtIndex, RspOperand::Rs, RspOperand::Imm7}},
 
-    if (has_element) {
-        fmt::print("element: 0x{:X}\n", element);
-    }
+
+    // Vd, Vs, Vt
+    { InstrId::rsp_vabs, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vadd, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vaddc, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vand, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vch, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vcl, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vcr, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_veq, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vge, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vlt, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmacf, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmacu, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadh, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadl, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadm, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadn, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmrg, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudh, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudl, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudm, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudn, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vne, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vnor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vnxor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vsub, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vsubc, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmulf, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmulu, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmulq, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vnand, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vxor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vsar, {RspOperand::Vd, RspOperand::Vs, RspOperand::None}},
+    { InstrId::rsp_vmacq, {RspOperand::Vd, RspOperand::None, RspOperand::None}},
+    // { InstrId::rsp_vzero, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}}, unused pseudo
+    { InstrId::rsp_vrndn, {RspOperand::Vd, RspOperand::VsIndex, RspOperand::Vt}},
+    { InstrId::rsp_vrndp, {RspOperand::Vd, RspOperand::VsIndex, RspOperand::Vt}},
 
-    return 0;
+    // Vd, De, Vt
+    { InstrId::rsp_vmov, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrcp, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrcpl, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrcph, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrsq, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrsql, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrsqh, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+
+    // Rt, Vs
+    { InstrId::rsp_mfc2, {RspOperand::Rt, RspOperand::Vs, RspOperand::None}},
+    { InstrId::rsp_mtc2, {RspOperand::Rt, RspOperand::Vs, RspOperand::None}},
+
+    // Nop
+    { InstrId::rsp_vnop, {RspOperand::None, RspOperand::None, RspOperand::None}}
+};
+
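+// Worked example (illustrative, not part of the original source): given the rsp_vmadh row
+// above, an input like `vmadh $v2, $v17, $v30[e]` is emitted by the operand loop further
+// down roughly as:
+//   rsp.VMADH<e>(rsp.vpu.r[2], rsp.vpu.r[17], rsp.vpu.r[30]);
+// where e is the element (broadcast) selector decoded from the instruction word.
+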
+std::string_view ctx_gpr_prefix(int reg) {
+    if (reg != 0) {
+        return "r";
+    }
+    return "";
+}
+
+uint32_t expected_c0_reg_value(int cop0_reg) {
+    switch (static_cast<Cop0Reg>(cop0_reg)) {
+    case Cop0Reg::RSP_COP0_SP_STATUS:
+        return 0; // None of the flags in RSP status are set
+    case Cop0Reg::RSP_COP0_SP_DMA_FULL:
+        return 0; // Pretend DMAs complete instantly
+    case Cop0Reg::RSP_COP0_SP_DMA_BUSY:
+        return 0; // Pretend DMAs complete instantly
+    case Cop0Reg::RSP_COP0_SP_SEMAPHORE:
+        return 0; // Always acquire the semaphore
+    }
+    fmt::print(stderr, "Unhandled mfc0: {}\n", cop0_reg);
+    assert(false);
+    return 0;
+}
+
+std::string_view c0_reg_write_action(int cop0_reg) {
+    switch (static_cast<Cop0Reg>(cop0_reg)) {
+    case Cop0Reg::RSP_COP0_SP_SEMAPHORE:
+        return ""; // Ignore semaphore functionality
+    case Cop0Reg::RSP_COP0_SP_STATUS:
+        return ""; // Ignore writes to the status flags since yielding is ignored
+    case Cop0Reg::RSP_COP0_SP_DRAM_ADDR:
+        return "SET_DMA_DRAM";
+    case Cop0Reg::RSP_COP0_SP_MEM_ADDR:
+        return "SET_DMA_DMEM";
+    case Cop0Reg::RSP_COP0_SP_RD_LEN:
+        return "DO_DMA_READ";
+    case Cop0Reg::RSP_COP0_SP_WR_LEN:
+        return "DO_DMA_WRITE";
+    }
+    fmt::print(stderr, "Unhandled mtc0: {}\n", cop0_reg);
+    assert(false);
+    return "";
+}
+
+std::optional<int> get_rsp_element(const rabbitizer::InstructionRsp& instr) {
+    if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementhigh)) {
+        return instr.GetRsp_elementhigh();
+    } else if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementlow) || instr.hasOperand(rabbitizer::OperandType::rsp_vs_index)) {
+        return instr.GetRsp_elementlow();
+    }
+
+    return std::nullopt;
+}
+
+bool rsp_ignores_element(InstrId id) {
+    return id == InstrId::rsp_vmacq || id == InstrId::rsp_vnop;
+}
+
+struct BranchTargets {
+    std::unordered_set<uint32_t> direct_targets;
+    std::unordered_set<uint32_t> indirect_targets;
+};
+
+BranchTargets get_branch_targets(const std::vector<rabbitizer::InstructionRsp>& instrs) {
+    BranchTargets ret;
+    for (const auto& instr : instrs) {
+        if (instr.isJumpWithAddress() || instr.isBranch()) {
+            ret.direct_targets.insert(instr.getBranchVramGeneric());
+        }
+        if (instr.doesLink()) {
+            ret.indirect_targets.insert(instr.getVram() + 2 * instr_size);
+        }
+    }
+    return ret;
+}
+
+bool process_instruction(size_t instr_index, const std::vector<rabbitizer::InstructionRsp>& instructions, std::ofstream& output_file, const BranchTargets& branch_targets, bool indent) {
+    const auto& instr = instructions[instr_index];
+
+    uint32_t instr_vram = instr.getVram();
+    InstrId instr_id = instr.getUniqueId();
+
+    // Print a label if one exists here
+    if (branch_targets.direct_targets.contains(instr_vram) || branch_targets.indirect_targets.contains(instr_vram)) {
+        fmt::print(output_file, "L_{:08X}:\n", instr_vram);
+    }
+
+    // Output a comment with the original instruction
+    if (instr.isBranch() || instr_id == InstrId::rsp_j) {
+        fmt::print(output_file, "    // {}\n", instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric())));
+    } else if (instr_id == InstrId::rsp_jal) {
+        fmt::print(output_file, "    // {}\n", instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric())));
+    } else {
+        fmt::print(output_file, "    // {}\n", instr.disassemble(0));
+    }
+
+    auto print_indent = [&]() {
+        fmt::print(output_file, "    ");
+    };
+
+    auto print_line = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        print_indent();
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, ";\n");
+    };
+
+    auto print_branch_condition = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, " ");
+    };
+
+    auto print_unconditional_branch = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        if (instr_index < instructions.size() - 1) {
+            uint32_t next_vram = instr_vram + 4;
+            process_instruction(instr_index + 1, instructions, output_file, branch_targets, false);
+        }
+        print_indent();
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, ";\n");
+    };
+
+    auto print_branch = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        fmt::print(output_file, "{{\n        ");
+        if (instr_index < instructions.size() - 1) {
+            uint32_t next_vram = instr_vram + 4;
+            process_instruction(instr_index + 1, instructions, output_file, branch_targets, true);
+        }
+        fmt::print(output_file, "        ");
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, ";\n    }}\n");
+    };
+
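+    // Note: MIPS branches have a delay slot, so the instruction after a branch executes
+    // regardless of the branch outcome. The branch helpers above handle this by recompiling
+    // the following instruction first: print_branch duplicates it inside the taken-branch
+    // block, while the fall-through path still emits it in normal program order.
+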
+    if (indent) {
+        print_indent();
+    }
+
+    int rd = (int)instr.GetO32_rd();
+    int rs = (int)instr.GetO32_rs();
+    int base = rs;
+    int rt = (int)instr.GetO32_rt();
+    int sa = (int)instr.Get_sa();
+
+    int fd = (int)instr.GetO32_fd();
+    int fs = (int)instr.GetO32_fs();
+    int ft = (int)instr.GetO32_ft();
+
+    uint16_t imm = instr.Get_immediate();
+
+    std::string unsigned_imm_string = fmt::format("{:#X}", imm);
+    std::string signed_imm_string = fmt::format("{:#X}", (int16_t)imm);
+
+    auto rsp_element = get_rsp_element(instr);
+
+    // If this instruction is in the vector operand table then emit the appropriate function call for its implementation
+    auto operand_find_it = vector_operands.find(instr_id);
+    if (operand_find_it != vector_operands.end()) {
+        const auto& operands = operand_find_it->second;
+        int vd = (int)instr.GetRsp_vd();
+        int vs = (int)instr.GetRsp_vs();
+        int vt = (int)instr.GetRsp_vt();
+        std::string operand_string = "";
+        for (RspOperand operand : operands) {
+            switch (operand) {
+            case RspOperand::Vt:
+                operand_string += fmt::format("rsp.vpu.r[{}], ", vt);
+                break;
+            case RspOperand::VtIndex:
+                operand_string += fmt::format("{}, ", vt);
+                break;
+            case RspOperand::Vd:
+                operand_string += fmt::format("rsp.vpu.r[{}], ", vd);
+                break;
+            case RspOperand::Vs:
+                operand_string += fmt::format("rsp.vpu.r[{}], ", vs);
+                break;
+            case RspOperand::VsIndex:
+                operand_string += fmt::format("{}, ", vs);
+                break;
+            case RspOperand::De:
+                operand_string += fmt::format("{}, ", instr.GetRsp_de());
+                break;
+            case RspOperand::Rt:
+                operand_string += fmt::format("{}{}, ", ctx_gpr_prefix(rt), rt);
+                break;
+            case RspOperand::Rs:
+                operand_string += fmt::format("{}{}, ", ctx_gpr_prefix(rs), rs);
+                break;
+            case RspOperand::Imm7:
+                // Sign extend the 7-bit immediate
+                operand_string += fmt::format("{:#X}, ", ((int8_t)(imm << 1)) >> 1);
+                break;
+            }
+        }
+        // Trim the trailing comma off the operands
+        if (operand_string.size() > 0) {
+            operand_string = operand_string.substr(0, operand_string.size() - 2);
+        }
+        std::string uppercase_name = "";
+        std::string lowercase_name = instr.getOpcodeName();
+        uppercase_name.reserve(lowercase_name.size() + 1);
+        for (char c : lowercase_name) {
+            uppercase_name += std::toupper(c);
+        }
+        if (rsp_ignores_element(instr_id)) {
+            print_line("rsp.{}({})", uppercase_name, operand_string);
+        } else {
+            print_line("rsp.{}<{}>({})", uppercase_name, rsp_element.value(), operand_string);
+        }
+    }
+    // Otherwise, implement the instruction directly
+    else {
+        switch (instr_id) {
+        case InstrId::rsp_nop:
+            fmt::print(output_file, "\n");
+            break;
+        // Arithmetic
+        case InstrId::rsp_lui:
+            print_line("{}{} = S32({} << 16)", ctx_gpr_prefix(rt), rt, unsigned_imm_string);
+            break;
+        case InstrId::rsp_add:
+        case InstrId::rsp_addu:
+            print_line("{}{} = RSP_ADD32({}{}, {}{})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_negu: // pseudo instruction for subu x, 0, y
+        case InstrId::rsp_sub:
+        case InstrId::rsp_subu:
+            print_line("{}{} = RSP_SUB32({}{}, {}{})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_addi:
+        case InstrId::rsp_addiu:
+            print_line("{}{} = RSP_ADD32({}{}, {})", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string);
+            break;
+        case InstrId::rsp_and:
+            print_line("{}{} = {}{} & {}{}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_andi:
+            print_line("{}{} = {}{} & {}", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, unsigned_imm_string);
+            break;
+        case InstrId::rsp_or:
+            print_line("{}{} = {}{} | {}{}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_ori:
+            print_line("{}{} = {}{} | {}", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, unsigned_imm_string);
+            break;
+        case InstrId::rsp_nor:
+            print_line("{}{} = ~({}{} | {}{})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_xor:
+            print_line("{}{} = {}{} ^ {}{}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_xori:
+            print_line("{}{} = {}{} ^ {}", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, unsigned_imm_string);
+            break;
+        case InstrId::rsp_sll:
+            print_line("{}{} = S32({}{}) << {}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, sa);
+            break;
+        case InstrId::rsp_sllv:
+            print_line("{}{} = S32({}{}) << ({}{} & 31)", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs);
+            break;
+        case InstrId::rsp_sra:
+            print_line("{}{} = S32(RSP_SIGNED({}{}) >> {})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, sa);
+            break;
+        case InstrId::rsp_srav:
+            print_line("{}{} = S32(RSP_SIGNED({}{}) >> ({}{} & 31))", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs);
+            break;
+        case InstrId::rsp_srl:
+            print_line("{}{} = S32(U32({}{}) >> {})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, sa);
+            break;
+        case InstrId::rsp_srlv:
+            print_line("{}{} = S32(U32({}{}) >> ({}{} & 31))", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs);
+            break;
+        case InstrId::rsp_slt:
+            print_line("{}{} = RSP_SIGNED({}{}) < RSP_SIGNED({}{}) ? 1 : 0", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_slti:
+            print_line("{}{} = RSP_SIGNED({}{}) < {} ? 1 : 0", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string);
+            break;
+        case InstrId::rsp_sltu:
+            print_line("{}{} = {}{} < {}{} ? 1 : 0", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_sltiu:
+            print_line("{}{} = {}{} < {} ? 1 : 0", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string);
+            break;
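+        // The RSP_MEM_* accessors used by the loads and stores below are macros from
+        // test/src/rsp.h; they index the 4KB DMEM array directly and XOR halfword/byte
+        // addresses to account for the byteswapped word layout shared with emulated RDRAM.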
1 : 0", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string); + break; + // Loads + // TODO ld + case InstrId::rsp_lw: + print_line("{}{} = RSP_MEM_W({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lh: + print_line("{}{} = RSP_MEM_H({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lb: + print_line("{}{} = RSP_MEM_B({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lhu: + print_line("{}{} = RSP_MEM_HU({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lbu: + print_line("{}{} = RSP_MEM_BU({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + // Stores + case InstrId::rsp_sw: + print_line("RSP_MEM_W({}, {}{}) = {}{}", signed_imm_string, ctx_gpr_prefix(base), base, ctx_gpr_prefix(rt), rt); + break; + case InstrId::rsp_sh: + print_line("RSP_MEM_H({}, {}{}) = {}{}", signed_imm_string, ctx_gpr_prefix(base), base, ctx_gpr_prefix(rt), rt); + break; + case InstrId::rsp_sb: + print_line("RSP_MEM_B({}, {}{}) = {}{}", signed_imm_string, ctx_gpr_prefix(base), base, ctx_gpr_prefix(rt), rt); + break; + // Branches + case InstrId::rsp_j: + case InstrId::rsp_b: + print_unconditional_branch("goto L_{:08X}", instr.getBranchVramGeneric()); + break; + case InstrId::rsp_jal: + print_line("{}{} = 0x{:08X}", ctx_gpr_prefix(31), 31, instr_vram + 2 * instr_size); + print_unconditional_branch("goto L_{:08X}", instr.getBranchVramGeneric()); + break; + case InstrId::rsp_jr: + print_line("jump_target = {}{}", ctx_gpr_prefix(rs), rs); + print_unconditional_branch("goto do_indirect_jump"); + break; + case InstrId::rsp_jalr: + print_line("jump_target = {}{}; {}{} = 0x{:8X}", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rd), rd, instr_vram + 2 * instr_size); + print_unconditional_branch("goto do_indirect_jump"); + break; + case InstrId::rsp_bne: + print_indent(); + print_branch_condition("if ({}{} != {}{})", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_beq: + print_indent(); + print_branch_condition("if ({}{} == {}{})", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_bgez: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) >= 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_bgtz: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) > 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_blez: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) <= 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_bltz: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) < 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_break: + print_line("return RspExitReason::Broke", instr_vram); + break; + case InstrId::rsp_mfc0: + print_line("{}{} = {}", ctx_gpr_prefix(rt), rt, expected_c0_reg_value(rd)); + break; + case InstrId::rsp_mtc0: + { + std::string_view write_action = 
+            if (!write_action.empty()) {
+                print_line("{}({}{})", write_action, ctx_gpr_prefix(rt), rt);
+            }
+            break;
+        }
+        default:
+            fmt::print(stderr, "Unhandled instruction: {}\n", instr.getOpcodeName());
+            assert(false);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+void write_indirect_jumps(std::ofstream& output_file, const BranchTargets& branch_targets) {
+    fmt::print(output_file,
+        "do_indirect_jump:\n"
+        "    switch (jump_target) {{ \n");
+    for (uint32_t branch_target: branch_targets.indirect_targets) {
+        fmt::print(output_file, "    case 0x{0:08X}: goto L_{0:08X};\n", branch_target);
+    }
+    fmt::print(output_file,
+        "    }}\n"
+        "    return RspExitReason::UnhandledJumpTarget;\n");
+}
+
+// TODO de-hardcode these
+constexpr size_t rsp_text_offset = 0xB8BAD0;
+constexpr size_t rsp_text_size = 0xAF0;
+constexpr size_t rsp_text_address = 0x04001080;
+std::string rom_file_path = "../test/oot_mq_debug.z64";
+std::string output_file_path = "../test/rsp/njpgdspMain.cpp";
+std::string output_function_name = "njpgdspMain";
+
+#ifdef _MSC_VER
+inline uint32_t byteswap(uint32_t val) {
+    return _byteswap_ulong(val);
+}
+#else
+constexpr uint32_t byteswap(uint32_t val) {
+    return __builtin_bswap32(val);
+}
+#endif
+
+static_assert((rsp_text_size / instr_size) * instr_size == rsp_text_size, "RSP microcode must be a multiple of the instruction size");
+
+int main() {
+    std::array<uint32_t, rsp_text_size / instr_size> instr_words{};
+    {
+        std::ifstream rom_file{ rom_file_path, std::ios_base::binary };
+
+        if (!rom_file.good()) {
+            fmt::print(stderr, "Failed to open rom file\n");
+            return EXIT_FAILURE;
+        }
+
+        rom_file.seekg(rsp_text_offset);
+        rom_file.read(reinterpret_cast<char*>(instr_words.data()), rsp_text_size);
+    }
+
+    // Disable appropriate pseudo instructions
+    RabbitizerConfig_Cfg.pseudos.pseudoMove = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBeqz = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBnez = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoNot = false;
+
+    // Decode the instruction words into instructions
+    std::vector<rabbitizer::InstructionRsp> instrs{};
+    instrs.reserve(instr_words.size());
+    uint32_t vram = rsp_text_address;
+    for (uint32_t instr_word : instr_words) {
+        const rabbitizer::InstructionRsp& instr = instrs.emplace_back(byteswap(instr_word), vram);
+        vram += instr_size;
+    }
+
+    // Collect indirect jump targets (return addresses for linked jumps)
+    BranchTargets branch_targets = get_branch_targets(instrs);
+
+    // Open output file and write beginning
+    std::ofstream output_file(output_file_path);
+    fmt::print(output_file,
+        "#include \"../src/rsp.h\"\n"
+        "#include \"../src/rsp_vu_impl.h\"\n"
+        "RspExitReason {}(uint8_t* rdram) {{\n"
+        "    uint32_t r1 = 0, r2 = 0, r3 = 0, r4 = 0, r5 = 0, r6 = 0, r7 = 0;\n"
+        "    uint32_t r8 = 0, r9 = 0, r10 = 0, r11 = 0, r12 = 0, r13 = 0, r14 = 0, r15 = 0;\n"
+        "    uint32_t r16 = 0, r17 = 0, r18 = 0, r19 = 0, r20 = 0, r21 = 0, r22 = 0, r23 = 0;\n"
+        "    uint32_t r24 = 0, r25 = 0, r26 = 0, r27 = 0, r28 = 0, r29 = 0, r30 = 0, r31 = 0;\n"
+        "    uint32_t dma_dmem_address = 0, dma_dram_address = 0, jump_target = 0;\n"
+        "    RSP rsp{{}};\n"
+        // r1 starts as the DMEM address of the OSTask (0xFC0), which the host copies
+        // there before calling the recompiled function (see run_rsp_microcode below).
+        "    r1 = 0xFC0;\n", output_function_name);
+    // Write each instruction
+    for (size_t instr_index = 0; instr_index < instrs.size(); instr_index++) {
+        process_instruction(instr_index, instrs, output_file, branch_targets, false);
+    }
+
+    // Terminate instruction code with a return to indicate that the microcode has run past its end
+    fmt::print(output_file, "    return RspExitReason::ImemOverrun;\n");
+
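+    // For reference, write_indirect_jumps below emits a dispatch block of roughly this
+    // shape (a sketch; the case list comes from the collected link targets):
+    //   do_indirect_jump:
+    //       switch (jump_target) {
+    //           case 0x04001080: goto L_04001080;
+    //           ...
+    //       }
+    //       return RspExitReason::UnhandledJumpTarget;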
+    // Write the section containing the indirect jump table
+    write_indirect_jumps(output_file, branch_targets);
+
+    // End the file
+    fmt::print(output_file, "}}\n");
+    return 0;
 }
diff --git a/lib/rabbitizer b/lib/rabbitizer
index 54f9976..b9a39f6 160000
--- a/lib/rabbitizer
+++ b/lib/rabbitizer
@@ -1 +1 @@
-Subproject commit 54f997607c62d8c1c5316ef414adf17f5c060797
+Subproject commit b9a39f6ec0a3ff6690ef2925e6275cf6578602cc
diff --git a/src/recompilation.cpp b/src/recompilation.cpp
index 95c943b..0cb5bba 100644
--- a/src/recompilation.cpp
+++ b/src/recompilation.cpp
@@ -960,6 +960,7 @@ bool process_instruction(const RecompPort::Context& context, const RecompPort::F
         return false;
     }
 
+    // TODO is this used?
     if (emit_link_branch) {
        fmt::print(output_file, "    after_{}:\n", link_branch_index);
    }
diff --git a/test/RecompTest.vcxproj b/test/RecompTest.vcxproj
index cdb109e..9dee895 100644
--- a/test/RecompTest.vcxproj
+++ b/test/RecompTest.vcxproj
@@ -162,6 +162,7 @@ XCOPY "$(ProjectDir)Lib\SDL2-2.24.0\lib\$(Platform)\SDL2.dll" "$(TargetDir)" /S
 
+    <ClCompile Include="rsp\njpgdspMain.cpp" />
 
diff --git a/test/RecompTest.vcxproj.filters b/test/RecompTest.vcxproj.filters
index 4e7083b..7f0a312 100644
--- a/test/RecompTest.vcxproj.filters
+++ b/test/RecompTest.vcxproj.filters
@@ -30234,6 +30234,9 @@
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="rsp\njpgdspMain.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
 
diff --git a/test/portultra/events.cpp b/test/portultra/events.cpp
index be7a413..e57af26 100644
--- a/test/portultra/events.cpp
+++ b/test/portultra/events.cpp
@@ -15,6 +15,7 @@
 #include "ultra64.h"
 #include "multilibultra.hpp"
 #include "recomp.h"
+#include "../src/rsp.h"
 
 struct SpTaskAction {
     OSTask task;
@@ -203,6 +204,44 @@ int sdl_event_filter(void* userdata, SDL_Event* event) {
     return 1;
 }
 
+uint8_t dmem[0x1000];
+uint16_t rspReciprocals[512];
+uint16_t rspInverseSquareRoots[512];
+
+using RspUcodeFunc = RspExitReason(uint8_t* rdram);
+extern RspUcodeFunc njpgdspMain;
+
+// From Ares emulator. For license details, see rsp_vu.h
+void rsp_constants_init() {
+    rspReciprocals[0] = u16(~0);
+    for (u16 index = 1; index < 512; index++) {
+        u64 a = index + 512;
+        u64 b = (u64(1) << 34) / a;
+        rspReciprocals[index] = u16(b + 1 >> 8);
+    }
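+    // The loop above builds the 512-entry fixed-point reciprocal table used by VRCP/VRCPL:
+    // for each normalized input (index + 512) it computes 2^34 / input and keeps the
+    // rounded top bits. The loop below does the same for reciprocal square roots, searching
+    // for the largest fixed-point b whose square times the input stays under the scale constant.
+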
+    for (u16 index = 0; index < 512; index++) {
+        u64 a = index + 512 >> ((index % 2 == 1) ? 1 : 0);
+        u64 b = 1 << 17;
+        //find the largest b where b < 1.0 / sqrt(a)
+        while (a * (b + 1) * (b + 1) < (u64(1) << 44)) b++;
+        rspInverseSquareRoots[index] = u16(b >> 1);
+    }
+}
+
+// Runs a recompiled RSP microcode
+void run_rsp_microcode(uint8_t* rdram, const OSTask* task, RspUcodeFunc* ucode_func) {
+    // Load the OSTask into DMEM
+    memcpy(&dmem[0xFC0], task, sizeof(OSTask));
+    // Load the ucode data into DMEM
+    dma_rdram_to_dmem(rdram, 0x0000, task->t.ucode_data, 0xF80 - 1);
+    // Run the ucode
+    RspExitReason exit_reason = ucode_func(rdram);
+    // Ensure that the ucode exited correctly
+    assert(exit_reason == RspExitReason::Broke);
+    sp_complete();
+}
+
 void event_thread_func(uint8_t* rdram, uint8_t* rom) {
     using namespace std::chrono_literals;
     if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_JOYSTICK) < 0) {
@@ -216,6 +255,8 @@ void event_thread_func(uint8_t* rdram, uint8_t* rom) {
     SDL_SetWindowTitle(window, "Recomp");
     //SDL_SetEventFilter(sdl_event_filter, nullptr);
 
+    rsp_constants_init();
+
     while (true) {
         // Try to pull an action from the queue
         Action action;
@@ -230,20 +271,7 @@
             } else if (task_action->task.t.type == M_AUDTASK) {
                 sp_complete();
             } else if (task_action->task.t.type == M_NJPEGTASK) {
-                uint32_t* jpeg_task = TO_PTR(uint32_t, (int32_t)(0x80000000 | task_action->task.t.data_ptr));
-                int32_t address = jpeg_task[0] | 0x80000000;
-                size_t mbCount = jpeg_task[1];
-                uint32_t mode = jpeg_task[2];
-                //int32_t qTableYPtr = jpeg_task[3] | 0x80000000;
-                //int32_t qTableUPtr = jpeg_task[4] | 0x80000000;
-                //int32_t qTableVPtr = jpeg_task[5] | 0x80000000;
-                //uint32_t mbSize = jpeg_task[6];
-                if (mode == 0) {
-                    memset(TO_PTR(void, address), 0, mbCount * 0x40 * sizeof(uint16_t) * 4);
-                } else {
-                    memset(TO_PTR(void, address), 0, mbCount * 0x40 * sizeof(uint16_t) * 6);
-                }
-                sp_complete();
+                run_rsp_microcode(rdram, &task_action->task, njpgdspMain);
             } else {
                 fprintf(stderr, "Unknown task type: %" PRIu32 "\n", task_action->task.t.type);
                 assert(false);
diff --git a/test/rsp/.gitignore b/test/rsp/.gitignore
new file mode 100644
index 0000000..06fee25
--- /dev/null
+++ b/test/rsp/.gitignore
@@ -0,0 +1 @@
+njpgdspMain.cpp
diff --git a/test/src/rsp.h b/test/src/rsp.h
new file mode 100644
index 0000000..a5e53fb
--- /dev/null
+++ b/test/src/rsp.h
@@ -0,0 +1,65 @@
+#ifndef __RSP_H__
+#define __RSP_H__
+
+#include "rsp_vu.h"
+#include "recomp.h"
+
+enum class RspExitReason {
+    Invalid,
+    Broke,
+    ImemOverrun,
+    UnhandledJumpTarget
+};
+
+extern uint8_t dmem[];
+extern uint16_t rspReciprocals[512];
+extern uint16_t rspInverseSquareRoots[512];
+
+#define RSP_MEM_W(offset, addr) \
+    (*reinterpret_cast<uint32_t*>(dmem + (offset) + (addr)))
+
+#define RSP_MEM_H(offset, addr) \
+    (*reinterpret_cast<int16_t*>(dmem + (((offset) + (addr)) ^ 2)))
+
+#define RSP_MEM_HU(offset, addr) \
+    (*reinterpret_cast<uint16_t*>(dmem + (((offset) + (addr)) ^ 2)))
+
+#define RSP_MEM_B(offset, addr) \
+    (*reinterpret_cast<int8_t*>(dmem + (((offset) + (addr)) ^ 3)))
+
+#define RSP_MEM_BU(offset, addr) \
+    (*reinterpret_cast<uint8_t*>(dmem + (((offset) + (addr)) ^ 3)))
+
+#define RSP_ADD32(a, b) \
+    ((int32_t)((a) + (b)))
+
+#define RSP_SUB32(a, b) \
+    ((int32_t)((a) - (b)))
+
+#define RSP_SIGNED(val) \
+    ((int32_t)(val))
+
+#define SET_DMA_DMEM(dmem_addr) dma_dmem_address = (dmem_addr)
+#define SET_DMA_DRAM(dram_addr) dma_dram_address = (dram_addr)
+#define DO_DMA_READ(rd_len) dma_rdram_to_dmem(rdram, dma_dmem_address, dma_dram_address, (rd_len))
+#define DO_DMA_WRITE(wr_len) dma_dmem_to_rdram(rdram, dma_dmem_address, dma_dram_address, (wr_len))
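+
+// Usage sketch (hypothetical addresses): recompiled mtc0 writes drive these macros, so a
+// DMEM -> RDRAM transfer ends up as a sequence like
+//   SET_DMA_DMEM(0x400); SET_DMA_DRAM(0x100000); DO_DMA_WRITE(0x7FF);
+// where the length register holds the byte count minus one, hence the "+ 1" in the
+// helpers below.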
+
+static inline void dma_rdram_to_dmem(uint8_t* rdram, uint32_t dmem_addr, uint32_t dram_addr, uint32_t rd_len) {
+    rd_len += 1; // Read length is inclusive
+    dram_addr &= 0xFFFFF8;
+    assert(dmem_addr + rd_len <= 0x1000);
+    for (uint32_t i = 0; i < rd_len; i++) {
+        RSP_MEM_B(i, dmem_addr) = MEM_B(0, (int64_t)(int32_t)(dram_addr + i + 0x80000000));
+    }
+}
+
+static inline void dma_dmem_to_rdram(uint8_t* rdram, uint32_t dmem_addr, uint32_t dram_addr, uint32_t wr_len) {
+    wr_len += 1; // Write length is inclusive
+    dram_addr &= 0xFFFFF8;
+    assert(dmem_addr + wr_len <= 0x1000);
+    for (uint32_t i = 0; i < wr_len; i++) {
+        MEM_B(0, (int64_t)(int32_t)(dram_addr + i + 0x80000000)) = RSP_MEM_B(i, dmem_addr);
+    }
+}
+
+#endif
diff --git a/test/src/rsp_vu.h b/test/src/rsp_vu.h
new file mode 100644
index 0000000..8ec9b3b
--- /dev/null
+++ b/test/src/rsp_vu.h
@@ -0,0 +1,199 @@
+// This file is modified from the Ares N64 emulator core. Ares can
+// be found at https://github.com/ares-emulator/ares. The original license
+// for this portion of Ares is as follows:
+// ----------------------------------------------------------------------
+// ares
+//
+// Copyright(c) 2004 - 2021 ares team, Near et al
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------
+#include <cstdint>
+
+#define ARCHITECTURE_AMD64
+#define ARCHITECTURE_SUPPORTS_SSE4_1 1
+
+#if defined(ARCHITECTURE_AMD64)
+#include <immintrin.h>
+using v128 = __m128i;
+#elif defined(ARCHITECTURE_ARM64)
+#include <sse2neon.h>
+using v128 = __m128i;
+#endif
+
+namespace Accuracy {
+    namespace RSP {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        constexpr bool SISD = false;
+        constexpr bool SIMD = true;
+#else
+        constexpr bool SISD = true;
+        constexpr bool SIMD = false;
+#endif
+    }
+}
+
+using u8 = uint8_t;
+using s8 = int8_t;
+using u16 = uint16_t;
+using s16 = int16_t;
+using u32 = uint32_t;
+using s32 = int32_t;
+using u64 = uint64_t;
+using s64 = int64_t;
+using uint128_t = uint64_t[2];
+
+template<u32 bits> inline auto sclamp(s64 x) -> s64 {
+    enum : s64 { b = 1ull << (bits - 1), m = b - 1 };
+    return (x > m) ? m : (x < -b) ? -b : x;
+}
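+
+// e.g. sclamp<16> saturates a wider intermediate into signed 16-bit range:
+// sclamp<16>(40000) == 32767 and sclamp<16>(-40000) == -32768, which is how the vector
+// unit clamps per-lane results in the implementations in rsp_vu_impl.h.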
+
+struct RSP {
+    using r32 = uint32_t;
+    using cr32 = const r32;
+
+    union r128 {
+        struct { uint64_t u128[2]; };
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        struct { __m128i v128; };
+
+        operator __m128i() const { return v128; }
+        auto operator=(__m128i value) { v128 = value; }
+#endif
+
+        auto byte(u32 index) -> uint8_t& { return ((uint8_t*)&u128)[15 - index]; }
+        auto byte(u32 index) const -> uint8_t { return ((uint8_t*)&u128)[15 - index]; }
+
+        auto element(u32 index) -> uint16_t& { return ((uint16_t*)&u128)[7 - index]; }
+        auto element(u32 index) const -> uint16_t { return ((uint16_t*)&u128)[7 - index]; }
+
+        auto u8(u32 index) -> uint8_t& { return ((uint8_t*)&u128)[15 - index]; }
+        auto u8(u32 index) const -> uint8_t { return ((uint8_t*)&u128)[15 - index]; }
+
+        auto s16(u32 index) -> int16_t& { return ((int16_t*)&u128)[7 - index]; }
+        auto s16(u32 index) const -> int16_t { return ((int16_t*)&u128)[7 - index]; }
+
+        auto u16(u32 index) -> uint16_t& { return ((uint16_t*)&u128)[7 - index]; }
+        auto u16(u32 index) const -> uint16_t { return ((uint16_t*)&u128)[7 - index]; }
+
+        //VCx registers
+        auto get(u32 index) const -> bool { return u16(index) != 0; }
+        auto set(u32 index, bool value) -> bool { return u16(index) = 0 - value, value; }
+
+        //vu-registers.cpp
+        auto operator()(u32 index) const -> r128;
+    };
+    using cr128 = const r128;
+
+    struct VU {
+        r128 r[32];
+        r128 acch, accm, accl;
+        r128 vcoh, vcol; //16-bit little endian
+        r128 vcch, vccl; //16-bit little endian
+        r128 vce; // 8-bit little endian
+        s16 divin;
+        s16 divout;
+        bool divdp;
+    } vpu;
+
+    static constexpr r128 zero{0};
+    static constexpr r128 invert{(uint64_t)-1, (uint64_t)-1};
+
+    auto accumulatorGet(u32 index) const -> u64;
+    auto accumulatorSet(u32 index, u64 value) -> void;
+    auto accumulatorSaturate(u32 index, bool slice, u16 negative, u16 positive) const -> u16;
+
+    auto CFC2(r32& rt, u8 rd) -> void;
+    auto CTC2(cr32& rt, u8 rd) -> void;
+    template<u8 e> auto LBV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LDV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LFV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LHV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LLV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LPV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LQV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LRV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LSV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LTV(u8 vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LUV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LWV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto MFC2(r32& rt, cr128& vs) -> void;
+    template<u8 e> auto MTC2(cr32& rt, r128& vs) -> void;
+    template<u8 e> auto SBV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SDV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SFV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SHV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SLV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SPV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SQV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SRV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SSV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto STV(u8 vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SUV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SWV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto VABS(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VADD(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VADDC(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VAND(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VCH(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VCL(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VCR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VEQ(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VGE(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VLT(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<bool U, u8 e>
+    auto VMACF(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMACF(r128& vd, cr128& vs, cr128& vt) -> void { VMACF<0, e>(vd, vs, vt); }
+    template<u8 e> auto VMACU(r128& vd, cr128& vs, cr128& vt) -> void { VMACF<1, e>(vd, vs, vt); }
+    auto VMACQ(r128& vd) -> void;
+    template<u8 e> auto VMADH(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMADL(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMADM(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMADN(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMOV(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VMRG(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDH(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDL(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDM(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDN(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<bool U, u8 e>
+    auto VMULF(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMULF(r128& rd, cr128& vs, cr128& vt) -> void { VMULF<0, e>(rd, vs, vt); }
+    template<u8 e> auto VMULU(r128& rd, cr128& vs, cr128& vt) -> void { VMULF<1, e>(rd, vs, vt); }
+    template<u8 e> auto VMULQ(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VNAND(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VNE(r128& vd, cr128& vs, cr128& vt) -> void;
+    auto VNOP() -> void;
+    template<u8 e> auto VNOR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VNXOR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VOR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<bool L, u8 e>
+    auto VRCP(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VRCP(r128& vd, u8 de, cr128& vt) -> void { VRCP<0, e>(vd, de, vt); }
+    template<u8 e> auto VRCPL(r128& vd, u8 de, cr128& vt) -> void { VRCP<1, e>(vd, de, vt); }
+    template<u8 e> auto VRCPH(r128& vd, u8 de, cr128& vt) -> void;
+    template<bool D, u8 e>
+    auto VRND(r128& vd, u8 vs, cr128& vt) -> void;
+    template<u8 e> auto VRNDN(r128& vd, u8 vs, cr128& vt) -> void { VRND<0, e>(vd, vs, vt); }
+    template<u8 e> auto VRNDP(r128& vd, u8 vs, cr128& vt) -> void { VRND<1, e>(vd, vs, vt); }
+    template<bool L, u8 e>
+    auto VRSQ(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VRSQ(r128& vd, u8 de, cr128& vt) -> void { VRSQ<0, e>(vd, de, vt); }
+    template<u8 e> auto VRSQL(r128& vd, u8 de, cr128& vt) -> void { VRSQ<1, e>(vd, de, vt); }
+    template<u8 e> auto VRSQH(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VSAR(r128& vd, cr128& vs) -> void;
+    template<u8 e> auto VSUB(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VSUBC(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VXOR(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VZERO(r128& rd, cr128& vs, cr128& vt) -> void;
+};
diff --git a/test/src/rsp_vu_impl.h b/test/src/rsp_vu_impl.h
new file mode 100644
index 0000000..8c22d14
--- /dev/null
+++ b/test/src/rsp_vu_impl.h
@@ -0,0 +1,1537 @@
+// This file is modified from the Ares N64 emulator core. Ares can
+// be found at https://github.com/ares-emulator/ares. The original license
+// for this portion of Ares is as follows:
+// ----------------------------------------------------------------------
+// ares
+//
+// Copyright(c) 2004 - 2021 ares team, Near et al
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------
+
+#include <cstdint>
+#include <algorithm>
+using u32 = uint32_t;
+
+#define ACCH vpu.acch
+#define ACCM vpu.accm
+#define ACCL vpu.accl
+#define VCOH vpu.vcoh
+#define VCOL vpu.vcol
+#define VCCH vpu.vcch
+#define VCCL vpu.vccl
+#define VCE vpu.vce
+
+#define DIVIN vpu.divin
+#define DIVOUT vpu.divout
+#define DIVDP vpu.divdp
+
+auto RSP::r128::operator()(u32 index) const -> r128 {
+    if constexpr (Accuracy::RSP::SISD) {
+        r128 v{ *this };
+        switch (index) {
+        case 0: break;
+        case 1: break;
+        case 2: v.u16(1) = v.u16(0); v.u16(3) = v.u16(2); v.u16(5) = v.u16(4); v.u16(7) = v.u16(6); break;
+        case 3: v.u16(0) = v.u16(1); v.u16(2) = v.u16(3); v.u16(4) = v.u16(5); v.u16(6) = v.u16(7); break;
+        case 4: v.u16(1) = v.u16(2) = v.u16(3) = v.u16(0); v.u16(5) = v.u16(6) = v.u16(7) = v.u16(4); break;
+        case 5: v.u16(0) = v.u16(2) = v.u16(3) = v.u16(1); v.u16(4) = v.u16(6) = v.u16(7) = v.u16(5); break;
+        case 6: v.u16(0) = v.u16(1) = v.u16(3) = v.u16(2); v.u16(4) = v.u16(5) = v.u16(7) = v.u16(6); break;
+        case 7: v.u16(0) = v.u16(1) = v.u16(2) = v.u16(3); v.u16(4) = v.u16(5) = v.u16(6) = v.u16(7); break;
+        case 8: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(0); break;
+        case 9: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(1); break;
+        case 10: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(2); break;
+        case 11: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(3); break;
+        case 12: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(4); break;
+        case 13: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(5); break;
+        case 14: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(6); break;
+        case 15: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(7); break;
+        }
+        return v;
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        static const __m128i shuffle[16] = {
+            //vector
+            _mm_set_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), //01234567
+            _mm_set_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), //01234567
+            //scalar quarter
+            _mm_set_epi8(15,14,15,14,11,10,11,10, 7, 6, 7, 6, 3, 2, 3, 2), //00224466
+            _mm_set_epi8(13,12,13,12, 9, 8, 9, 8, 5, 4, 5, 4, 1, 0, 1, 0), //11335577
+            //scalar half
+            _mm_set_epi8(15,14,15,14,15,14,15,14, 7, 6, 7, 6, 7, 6, 7, 6), //00004444
+            _mm_set_epi8(13,12,13,12,13,12,13,12, 5, 4, 5, 4, 5, 4, 5, 4), //11115555
+            _mm_set_epi8(11,10,11,10,11,10,11,10, 3, 2, 3, 2, 3, 2, 3, 2), //22226666
+            _mm_set_epi8(9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0), //33337777
+            //scalar whole
+            _mm_set_epi8(15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14), //00000000
+            _mm_set_epi8(13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12), //11111111
+            _mm_set_epi8(11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10), //22222222
+            _mm_set_epi8(9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8), //33333333
+            _mm_set_epi8(7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6), //44444444
+            _mm_set_epi8(5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4), //55555555
+            _mm_set_epi8(3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2), //66666666
+            _mm_set_epi8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0), //77777777
+        };
+        //todo: benchmark to see if testing for cases 0&1 to return value directly is faster
+        r128 ret;
+        ret.v128 = _mm_shuffle_epi8(v128, shuffle[index]);
+        return ret;
+#endif
+    }
+}
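+
+// operator()(e) implements the RSP element selector used as vt(e) throughout this file:
+// e = 0 or 1 returns the vector unchanged, 2..3 repeat every other lane ("scalar quarter"),
+// 4..7 repeat each group of four ("scalar half"), and 8..15 broadcast lane (e - 8) to all
+// eight lanes; e.g. vt(9) yields element 1 in every lane.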
+
+auto RSP::accumulatorGet(u32 index) const -> u64 {
+    return (u64)ACCH.u16(index) << 32 | (u64)ACCM.u16(index) << 16 | (u64)ACCL.u16(index) << 0;
+}
+
+auto RSP::accumulatorSet(u32 index, u64 value) -> void {
+    ACCH.u16(index) = value >> 32;
+    ACCM.u16(index) = value >> 16;
+    ACCL.u16(index) = value >> 0;
+}
+
+auto RSP::accumulatorSaturate(u32 index, bool slice, u16 negative, u16 positive) const -> u16 {
+    if (ACCH.s16(index) < 0) {
+        if (ACCH.u16(index) != 0xffff) return negative;
+        if (ACCM.s16(index) >= 0) return negative;
+    } else {
+        if (ACCH.u16(index) != 0x0000) return positive;
+        if (ACCM.s16(index) < 0) return positive;
+    }
+    return !slice ? ACCL.u16(index) : ACCM.u16(index);
+}
+
+auto RSP::CFC2(r32& rt, u8 rd) -> void {
+    r128 hi, lo;
+    switch (rd & 3) {
+    case 0x00: hi = VCOH; lo = VCOL; break;
+    case 0x01: hi = VCCH; lo = VCCL; break;
+    case 0x02: hi = zero; lo = VCE; break;
+    case 0x03: hi = zero; lo = VCE; break; //unverified
+    }
+
+    if constexpr (Accuracy::RSP::SISD) {
+        rt = 0;
+        for (u32 n = 0; n < 8; n++) {
+            rt |= lo.get(n) << 0 + n;
+            rt |= hi.get(n) << 8 + n;
+        }
+        rt = s16(rt);
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        static const v128 reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        rt = s16(_mm_movemask_epi8(_mm_shuffle_epi8(_mm_packs_epi16(hi, lo), reverse)));
+#endif
+    }
+}
+
+auto RSP::CTC2(cr32& rt, u8 rd) -> void {
+    r128* hi; r128* lo;
+    r128 null;
+    switch (rd & 3) {
+    case 0x00: hi = &VCOH; lo = &VCOL; break;
+    case 0x01: hi = &VCCH; lo = &VCCL; break;
+    case 0x02: hi = &null; lo = &VCE; break;
+    case 0x03: hi = &null; lo = &VCE; break; //unverified
+    }
+
+    if constexpr (Accuracy::RSP::SISD) {
+        for (u32 n = 0; n < 8; n++) {
+            lo->set(n, rt & 1 << 0 + n);
+            hi->set(n, rt & 1 << 8 + n);
+        }
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        static const v128 mask = _mm_set_epi16(0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080);
+        lo->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{ ~rt >> 0 }, zero), mask), zero);
+        hi->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{ ~rt >> 8 }, zero), mask), zero);
+#endif
+    }
+}
+
+template<u8 e>
+auto RSP::LBV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm;
+    vt.byte(e) = RSP_MEM_B(0, address);
+}
+
+template<u8 e>
+auto RSP::LDV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = std::min(start + 8, 16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LFV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    auto start = e;
+    auto end = std::min(start + 8, 16);
+    r128 tmp;
+    for (u32 offset = 0; offset < 4; offset++) {
+        tmp.element(offset + 0) = RSP_MEM_B(0, address + (index + offset * 4 + 0 & 15)) << 7;
+        tmp.element(offset + 4) = RSP_MEM_B(0, address + (index + offset * 4 + 8 & 15)) << 7;
+    }
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset) = tmp.byte(offset);
+    }
+}
+
+template<u8 e>
+auto RSP::LHV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        vt.element(offset) = RSP_MEM_B(0, address + (index + offset * 2 & 15)) << 7;
+    }
+}
+
+template<u8 e>
+auto RSP::LLV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 4;
+    auto start = e;
+    auto end = std::min(start + 4, 16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LPV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        vt.element(offset) = RSP_MEM_B(0, address + (index + offset & 15)) << 8;
+    }
+}
+
+template<u8 e>
+auto RSP::LQV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = std::min((u32)(16 + e - (address & 15)), (u32)16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LRV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = e;
+    auto start = 16 - ((address & 15) - index);
+    address &= ~15;
+    for (u32 offset = start; offset < 16; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LSV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 2;
+    auto start = e;
+    auto end = std::min(start + 2, 16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LTV(u8 vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto begin = address & ~7;
+    address = begin + ((e + (address & 8)) & 15);
+    auto vtbase = vt & ~7;
+    auto vtoff = e >> 1;
+    for (u32 i = 0; i < 8; i++) {
+        vpu.r[vtbase + vtoff].byte(i * 2 + 0) = RSP_MEM_B(0, address++);
+        if (address == begin + 16) address = begin;
+        vpu.r[vtbase + vtoff].byte(i * 2 + 1) = RSP_MEM_B(0, address++);
+        if (address == begin + 16) address = begin;
+        vtoff = vtoff + 1 & 7;
+    }
+}
+
+template<u8 e>
+auto RSP::LUV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        vt.element(offset) = RSP_MEM_B(0, address + (index + offset & 15)) << 7;
+    }
+}
+
+template<u8 e>
+auto RSP::LWV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = 16 - e;
+    auto end = e + 16;
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address);
+        address += 4;
+    }
+}
+
+template<u8 e>
+auto RSP::MFC2(r32& rt, cr128& vs) -> void {
+    auto hi = vs.byte(e + 0 & 15);
+    auto lo = vs.byte(e + 1 & 15);
+    rt = s16(hi << 8 | lo << 0);
+}
+
+template<u8 e>
+auto RSP::MTC2(cr32& rt, r128& vs) -> void {
+    vs.byte(e + 0) = rt >> 8;
+    if (e != 15) vs.byte(e + 1) = rt >> 0;
+}
+
+template<u8 e>
+auto RSP::SBV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm;
+    RSP_MEM_B(0, address) = vt.byte(e);
+}
+
+template<u8 e>
+auto RSP::SDV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = start + 8;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SFV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto base = address & 7;
+    address &= ~7;
+    switch (e) {
+    case 0: case 15:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(0) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(1) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(2) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(3) >> 7;
+        break;
+    case 1:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(6) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(7) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(4) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(5) >> 7;
+        break;
+    case 4:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(1) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(2) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(3) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(0) >> 7;
+        break;
+    case 5:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(7) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(4) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(5) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(6) >> 7;
+        break;
+    case 8:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(4) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(5) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(6) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(7) >> 7;
+        break;
+    case 11:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(3) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(0) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(1) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(2) >> 7;
+        break;
+    case 12:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(5) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(6) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(7) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(4) >> 7;
+        break;
+    default:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = 0;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = 0;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = 0;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = 0;
+        break;
+    }
+}
+
+template<u8 e>
+auto RSP::SHV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = address & 7;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        auto byte = e + offset * 2;
+        auto value = vt.byte(byte + 0 & 15) << 1 | vt.byte(byte + 1 & 15) >> 7;
+        RSP_MEM_B(0, address + (index + offset * 2 & 15)) = value;
+    }
+}
+
+template<u8 e>
+auto RSP::SLV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 4;
+    auto start = e;
+    auto end = start + 4;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SPV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = start + 8;
+    for (u32 offset = start; offset < end; offset++) {
+        if ((offset & 15) < 8) {
+            RSP_MEM_B(0, address++) = vt.byte((offset & 7) << 1);
+        } else {
+            RSP_MEM_B(0, address++) = vt.element(offset & 7) >> 7;
+        }
+    }
+}
+
+template<u8 e>
+auto RSP::SQV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = start + (16 - (address & 15));
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SRV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = start + (address & 15);
+    auto base = 16 - (address & 15);
+    address &= ~15;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset + base & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SSV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 2;
+    auto start = e;
+    auto end = start + 2;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::STV(u8 vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = vt & ~7;
+    auto end = start + 8;
+    auto element = 16 - (e & ~1);
+    auto base = (address & 7) - (e & ~1);
+    address &= ~7;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address + (base++ & 15)) = vpu.r[offset].byte(element++ & 15);
+        RSP_MEM_B(0, address + (base++ & 15)) = vpu.r[offset].byte(element++ & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SUV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = start + 8;
+    for (u32 offset = start; offset < end; offset++) {
+        if ((offset & 15) < 8) {
+            RSP_MEM_B(0, address++) = vt.element(offset & 7) >> 7;
+        } else {
+            RSP_MEM_B(0, address++) = vt.byte((offset & 7) << 1);
+        }
+    }
+}
+
+template<u8 e>
+auto RSP::SWV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = start + 16;
+    auto base = address & 7;
+    address &= ~7;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address + (base++ & 15)) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::VABS(r128& vd, cr128& vs, cr128& vt) -> void {
+    if constexpr (Accuracy::RSP::SISD) {
+        r128 vte = vt(e);
+        for (u32 n = 0; n < 8; n++) {
+            if (vs.s16(n) < 0) {
+                if (vte.s16(n) == -32768) {
+                    ACCL.s16(n) = -32768;
+                    vd.s16(n) = 32767;
+                } else {
+                    ACCL.s16(n) = -vte.s16(n);
+                    vd.s16(n) = -vte.s16(n);
+                }
+            } else if (vs.s16(n) > 0) {
+                ACCL.s16(n) = +vte.s16(n);
+                vd.s16(n) = +vte.s16(n);
+            } else {
+                ACCL.s16(n) = 0;
+                vd.s16(n) = 0;
+            }
+        }
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        r128 vs0, slt;
+        vs0 = _mm_cmpeq_epi16(vs, zero);
+        slt = _mm_srai_epi16(vs, 15);
+        vd = _mm_andnot_si128(vs0, vt(e));
+        vd = _mm_xor_si128(vd, slt);
+        ACCL = _mm_sub_epi16(vd, slt);
+        vd = _mm_subs_epi16(vd, slt);
+#endif
+    }
+}
+
+template<u8 e>
+auto RSP::VADD(r128& vd, cr128& vs, cr128& vt) -> void {
+    if constexpr (Accuracy::RSP::SISD) {
+        cr128 vte = vt(e);
+        for (u32 n = 0; n < 8; n++) {
+            s32 result = vs.s16(n) + vte.s16(n) + VCOL.get(n);
+            ACCL.s16(n) = result;
+            vd.s16(n) = sclamp<16>(result);
+        }
+        VCOL = zero;
+        VCOH = zero;
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        r128 vte = vt(e), sum, min, max;
+        sum = _mm_add_epi16(vs, vte);
+        ACCL = _mm_sub_epi16(sum, VCOL);
+        min = _mm_min_epi16(vs, vte);
+        max = _mm_max_epi16(vs, vte);
+        min = _mm_subs_epi16(min, VCOL);
+        vd = _mm_adds_epi16(min, max);
+        VCOL = zero;
+        VCOH = zero;
+#endif
+    }
+}
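+
+// Note on the SSE path of VADD above: VCOL holds each lane's carry-in as 0 or -1, so
+// subtracting it adds the carry. Applying the carry to the smaller operand first with a
+// saturating subtract, then saturating-adding the larger operand, reproduces the
+// hardware's clamped result without widening to 32 bits.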
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sum;
+    sum = _mm_adds_epu16(vs, vte);
+    ACCL = _mm_add_epi16(vs, vte);
+    VCOL = _mm_cmpeq_epi16(sum, ACCL);
+    VCOL = _mm_cmpeq_epi16(VCOL, zero);
+    VCOH = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VAND(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    r128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = vs.u16(n) & vte.u16(n);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_and_si128(vs, vt(e));
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VCH(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      if ((vs.s16(n) ^ vte.s16(n)) < 0) {
+        s16 result = vs.s16(n) + vte.s16(n);
+        ACCL.s16(n) = (result <= 0 ? -vte.s16(n) : vs.s16(n));
+        VCCL.set(n, result <= 0);
+        VCCH.set(n, vte.s16(n) < 0);
+        VCOL.set(n, 1);
+        VCOH.set(n, result != 0 && vs.u16(n) != (vte.u16(n) ^ 0xffff));
+        VCE.set(n, result == -1);
+      } else {
+        s16 result = vs.s16(n) - vte.s16(n);
+        ACCL.s16(n) = (result >= 0 ? vte.s16(n) : vs.s16(n));
+        VCCL.set(n, vte.s16(n) < 0);
+        VCCH.set(n, result >= 0);
+        VCOL.set(n, 0);
+        VCOH.set(n, result != 0 && vs.u16(n) != (vte.u16(n) ^ 0xffff));
+        VCE.set(n, 0);
+      }
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), nvt, diff, diff0, vtn, dlez, dgez, mask;
+    VCOL = _mm_xor_si128(vs, vte);
+    VCOL = _mm_cmplt_epi16(VCOL, zero);
+    nvt = _mm_xor_si128(vte, VCOL);
+    nvt = _mm_sub_epi16(nvt, VCOL);
+    diff = _mm_sub_epi16(vs, nvt);
+    diff0 = _mm_cmpeq_epi16(diff, zero);
+    vtn = _mm_cmplt_epi16(vte, zero);
+    dlez = _mm_cmpgt_epi16(diff, zero);
+    dgez = _mm_or_si128(dlez, diff0);
+    dlez = _mm_cmpeq_epi16(zero, dlez);
+    VCCH = _mm_blendv_epi8(dgez, vtn, VCOL);
+    VCCL = _mm_blendv_epi8(vtn, dlez, VCOL);
+    VCE = _mm_cmpeq_epi16(diff, VCOL);
+    VCE = _mm_and_si128(VCE, VCOL);
+    VCOH = _mm_or_si128(diff0, VCE);
+    VCOH = _mm_cmpeq_epi16(VCOH, zero);
+    mask = _mm_blendv_epi8(VCCH, VCCL, VCOL);
+    ACCL = _mm_blendv_epi8(vs, nvt, mask);
+    vd = ACCL;
+#endif
+  }
+}
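+// Commentary (added, not original): VCH seeds the clip flags (VCCL/VCCH,
+// VCOL/VCOH, VCE) from the sign-mismatch test above, and VCL below consumes
+// them to finish the clamp; microcode conventionally pairs VCH on the high
+// 16 bits of a 32-bit value with VCL on the low 16 bits.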
+
+template<u8 e>
+auto RSP::VCL(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      if (VCOL.get(n)) {
+        if (VCOH.get(n)) {
+          ACCL.u16(n) = VCCL.get(n) ? -vte.u16(n) : vs.u16(n);
+        } else {
+          u16 sum = vs.u16(n) + vte.u16(n);
+          bool carry = (vs.u16(n) + vte.u16(n)) != sum;
+          if (VCE.get(n)) {
+            ACCL.u16(n) = VCCL.set(n, (!sum || !carry)) ? -vte.u16(n) : vs.u16(n);
+          } else {
+            ACCL.u16(n) = VCCL.set(n, (!sum && !carry)) ? -vte.u16(n) : vs.u16(n);
+          }
+        }
+      } else {
+        if (VCOH.get(n)) {
+          ACCL.u16(n) = VCCH.get(n) ? vte.u16(n) : vs.u16(n);
+        } else {
+          ACCL.u16(n) = VCCH.set(n, (s32)vs.u16(n) - (s32)vte.u16(n) >= 0) ? vte.u16(n) : vs.u16(n);
+        }
+      }
+    }
+    VCOL = zero;
+    VCOH = zero;
+    VCE = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), nvt, diff, ncarry, nvce, diff0, lec1, lec2, leeq, geeq, le, ge, mask;
+    nvt = _mm_xor_si128(vte, VCOL);
+    nvt = _mm_sub_epi16(nvt, VCOL);
+    diff = _mm_sub_epi16(vs, nvt);
+    ncarry = _mm_adds_epu16(vs, vte);
+    ncarry = _mm_cmpeq_epi16(diff, ncarry);
+    nvce = _mm_cmpeq_epi16(VCE, zero);
+    diff0 = _mm_cmpeq_epi16(diff, zero);
+    lec1 = _mm_and_si128(diff0, ncarry);
+    lec1 = _mm_and_si128(nvce, lec1);
+    lec2 = _mm_or_si128(diff0, ncarry);
+    lec2 = _mm_and_si128(VCE, lec2);
+    leeq = _mm_or_si128(lec1, lec2);
+    geeq = _mm_subs_epu16(vte, vs);
+    geeq = _mm_cmpeq_epi16(geeq, zero);
+    le = _mm_andnot_si128(VCOH, VCOL);
+    le = _mm_blendv_epi8(VCCL, leeq, le);
+    ge = _mm_or_si128(VCOL, VCOH);
+    ge = _mm_blendv_epi8(geeq, VCCH, ge);
+    mask = _mm_blendv_epi8(ge, le, VCOL);
+    ACCL = _mm_blendv_epi8(vs, nvt, mask);
+    VCCH = ge;
+    VCCL = le;
+    VCOH = zero;
+    VCOL = zero;
+    VCE = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VCR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      if ((vs.s16(n) ^ vte.s16(n)) < 0) {
+        VCCH.set(n, vte.s16(n) < 0);
+        ACCL.u16(n) = VCCL.set(n, vs.s16(n) + vte.s16(n) + 1 <= 0) ? ~vte.u16(n) : vs.u16(n);
+      } else {
+        VCCL.set(n, vte.s16(n) < 0);
+        ACCL.u16(n) = VCCH.set(n, vs.s16(n) - vte.s16(n) >= 0) ? vte.u16(n) : vs.u16(n);
+      }
+    }
+    VCOL = zero;
+    VCOH = zero;
+    VCE = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sign, dlez, dgez, nvt, mask;
+    sign = _mm_xor_si128(vs, vte);
+    sign = _mm_srai_epi16(sign, 15);
+    dlez = _mm_and_si128(vs, sign);
+    dlez = _mm_add_epi16(dlez, vte);
+    VCCL = _mm_srai_epi16(dlez, 15);
+    dgez = _mm_or_si128(vs, sign);
+    dgez = _mm_min_epi16(dgez, vte);
+    VCCH = _mm_cmpeq_epi16(dgez, vte);
+    nvt = _mm_xor_si128(vte, sign);
+    mask = _mm_blendv_epi8(VCCH, VCCL, sign);
+    ACCL = _mm_blendv_epi8(vs, nvt, mask);
+    vd = ACCL;
+    VCOL = zero;
+    VCOH = zero;
+    VCE = zero;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VEQ(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, !VCOH.get(n) && vs.u16(n) == vte.u16(n)) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;  //unverified
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    VCCL = _mm_andnot_si128(VCOH, eq);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;  //unverified
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
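+// Commentary (added, not original): VEQ/VNE/VGE/VLT are compare-and-select
+// ops: VCCL records the per-lane predicate, ACCL (and vd) receive whichever
+// operand the predicate picks, and the VCO flags left by a preceding
+// VSUBC/VADDC break ties on equality before being cleared.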
+
+template<u8 e>
+auto RSP::VGE(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, vs.s16(n) > vte.s16(n) || (vs.s16(n) == vte.s16(n) && (!VCOL.get(n) || !VCOH.get(n)))) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;  //unverified
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq, gt, es;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    gt = _mm_cmpgt_epi16(vs, vte);
+    es = _mm_and_si128(VCOH, VCOL);
+    eq = _mm_andnot_si128(es, eq);
+    VCCL = _mm_or_si128(gt, eq);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VLT(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, vs.s16(n) < vte.s16(n) || (vs.s16(n) == vte.s16(n) && VCOL.get(n) && VCOH.get(n))) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq, lt;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    lt = _mm_cmplt_epi16(vs, vte);
+    eq = _mm_and_si128(VCOH, eq);
+    eq = _mm_and_si128(VCOL, eq);
+    VCCL = _mm_or_si128(lt, eq);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<bool U, u8 e>
+auto RSP::VMACF(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + (s64)vs.s16(n) * (s64)vte.s16(n) * 2);
+      if constexpr (U == 0) {
+        vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+      }
+      if constexpr (U == 1) {
+        vd.u16(n) = ACCH.s16(n) < 0 ? 0x0000 : (ACCH.s16(n) || ACCM.s16(n) < 0) ? 0xffff : ACCM.u16(n);
+      }
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, md, hi, carry, omask;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epi16(vs, vte);
+    md = _mm_slli_epi16(hi, 1);
+    carry = _mm_srli_epi16(lo, 15);
+    hi = _mm_srai_epi16(hi, 15);
+    md = _mm_or_si128(md, carry);
+    lo = _mm_slli_epi16(lo, 1);
+    omask = _mm_adds_epu16(ACCL, lo);
+    ACCL = _mm_add_epi16(ACCL, lo);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    md = _mm_sub_epi16(md, omask);
+    carry = _mm_cmpeq_epi16(md, zero);
+    carry = _mm_and_si128(carry, omask);
+    hi = _mm_sub_epi16(hi, carry);
+    omask = _mm_adds_epu16(ACCM, md);
+    ACCM = _mm_add_epi16(ACCM, md);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    if constexpr (!U) {
+      lo = _mm_unpacklo_epi16(ACCM, ACCH);
+      hi = _mm_unpackhi_epi16(ACCM, ACCH);
+      vd = _mm_packs_epi32(lo, hi);
+    } else {
+      r128 mmask, hmask;
+      mmask = _mm_srai_epi16(ACCM, 15);
+      hmask = _mm_srai_epi16(ACCH, 15);
+      md = _mm_or_si128(mmask, ACCM);
+      omask = _mm_cmpgt_epi16(ACCH, zero);
+      md = _mm_andnot_si128(hmask, md);
+      vd = _mm_or_si128(omask, md);
+    }
+#endif
+  }
+}
+
+auto RSP::VMACQ(r128& vd) -> void {
+  for (u32 n = 0; n < 8; n++) {
+    s32 product = ACCH.element(n) << 16 | ACCM.element(n) << 0;
+    if (product < 0 && !(product & (1 << 5))) product += 32;
+    else if (product >= 32 && !(product & (1 << 5))) product -= 32;
+    ACCH.element(n) = product >> 16;
+    ACCM.element(n) = product >> 0;
+    vd.element(n) = sclamp<16>(product >> 1) & ~15;
+  }
+}
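+// Commentary (added, not original): the multiply-accumulate family keeps a
+// 48-bit accumulator per lane as three 16-bit slices ACCH:ACCM:ACCL, packed
+// and unpacked by accumulatorGet/accumulatorSet (defined elsewhere in this
+// file). A rough scalar sketch of one VMACF step:
+//   s64 acc = accumulatorGet(n);            // sign-extended 48-bit value
+//   acc += (s64)vs.s16(n) * vt.s16(n) * 2;  // fractional product, doubled
+//   accumulatorSet(n, acc);                 // split back into H:M:L slices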
+
+template<u8 e>
+auto RSP::VMADH(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      s32 result = (accumulatorGet(n) >> 16) + vs.s16(n) * vte.s16(n);
+      ACCH.u16(n) = result >> 16;
+      ACCM.u16(n) = result >> 0;
+      vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, omask;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epi16(vs, vte);
+    omask = _mm_adds_epu16(ACCM, lo);
+    ACCM = _mm_add_epi16(ACCM, lo);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(hi, omask);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    lo = _mm_unpacklo_epi16(ACCM, ACCH);
+    hi = _mm_unpackhi_epi16(ACCM, ACCH);
+    vd = _mm_packs_epi32(lo, hi);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMADL(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + (u32(vs.u16(n) * vte.u16(n)) >> 16));
+      vd.u16(n) = accumulatorSaturate(n, 0, 0x0000, 0xffff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), hi, omask, nhi, nmd, shi, smd, cmask, cval;
+    hi = _mm_mulhi_epu16(vs, vte);
+    omask = _mm_adds_epu16(ACCL, hi);
+    ACCL = _mm_add_epi16(ACCL, hi);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(zero, omask);
+    omask = _mm_adds_epu16(ACCM, hi);
+    ACCM = _mm_add_epi16(ACCM, hi);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    nhi = _mm_srai_epi16(ACCH, 15);
+    nmd = _mm_srai_epi16(ACCM, 15);
+    shi = _mm_cmpeq_epi16(nhi, ACCH);
+    smd = _mm_cmpeq_epi16(nhi, nmd);
+    cmask = _mm_and_si128(smd, shi);
+    cval = _mm_cmpeq_epi16(nhi, zero);
+    vd = _mm_blendv_epi8(cval, ACCL, cmask);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMADM(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + vs.s16(n) * vte.u16(n));
+      vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, sign, vta, omask;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vs, 15);
+    vta = _mm_and_si128(vte, sign);
+    hi = _mm_sub_epi16(hi, vta);
+    omask = _mm_adds_epu16(ACCL, lo);
+    ACCL = _mm_add_epi16(ACCL, lo);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(hi, omask);
+    omask = _mm_adds_epu16(ACCM, hi);
+    ACCM = _mm_add_epi16(ACCM, hi);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_srai_epi16(hi, 15);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    lo = _mm_unpacklo_epi16(ACCM, ACCH);
+    hi = _mm_unpackhi_epi16(ACCM, ACCH);
+    vd = _mm_packs_epi32(lo, hi);
+#endif
+  }
+}
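+// Commentary (added, not original): the repeated omask pattern in these SIMD
+// paths is a manual carry ripple. After adding a partial product into ACCL
+// (or ACCM), comparing the wrapped sum with the saturating sum leaves
+// all-ones in each lane that carried; subtracting that mask (-1 per lane)
+// then propagates the carry into the next accumulator slice.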
+
+template<u8 e>
+auto RSP::VMADN(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + s64(vs.u16(n) * vte.s16(n)));
+      vd.u16(n) = accumulatorSaturate(n, 0, 0x0000, 0xffff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, sign, vsa, omask, nhi, nmd, shi, smd, cmask, cval;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vte, 15);
+    vsa = _mm_and_si128(vs, sign);
+    hi = _mm_sub_epi16(hi, vsa);
+    omask = _mm_adds_epu16(ACCL, lo);
+    ACCL = _mm_add_epi16(ACCL, lo);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(hi, omask);
+    omask = _mm_adds_epu16(ACCM, hi);
+    ACCM = _mm_add_epi16(ACCM, hi);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_srai_epi16(hi, 15);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    nhi = _mm_srai_epi16(ACCH, 15);
+    nmd = _mm_srai_epi16(ACCM, 15);
+    shi = _mm_cmpeq_epi16(nhi, ACCH);
+    smd = _mm_cmpeq_epi16(nhi, nmd);
+    cmask = _mm_and_si128(smd, shi);
+    cval = _mm_cmpeq_epi16(nhi, zero);
+    vd = _mm_blendv_epi8(cval, ACCL, cmask);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMOV(r128& vd, u8 de, cr128& vt) -> void {
+  cr128 vte = vt(e);
+  vd.u16(de) = vte.u16(de);
+  ACCL = vte;
+}
+
+template<u8 e>
+auto RSP::VMRG(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.get(n) ? vs.u16(n) : vte.u16(n);
+    }
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_blendv_epi8(vt(e), vs, VCCL);
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDH(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, s64(vs.s16(n) * vte.s16(n)) << 16);
+      vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi;
+    ACCL = zero;
+    ACCM = _mm_mullo_epi16(vs, vte);
+    ACCH = _mm_mulhi_epi16(vs, vte);
+    lo = _mm_unpacklo_epi16(ACCM, ACCH);
+    hi = _mm_unpackhi_epi16(ACCM, ACCH);
+    vd = _mm_packs_epi32(lo, hi);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDL(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, u16((vs.u16(n) * vte.u16(n)) >> 16));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_mulhi_epu16(vs, vt(e));
+    ACCM = zero;
+    ACCH = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDM(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, s32(vs.s16(n) * vte.u16(n)));
+    }
+    vd = ACCM;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sign, vta;
+    ACCL = _mm_mullo_epi16(vs, vte);
+    ACCM = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vs, 15);
+    vta = _mm_and_si128(vte, sign);
+    ACCM = _mm_sub_epi16(ACCM, vta);
+    ACCH = _mm_srai_epi16(ACCM, 15);
+    vd = ACCM;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDN(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, s32(vs.u16(n) * vte.s16(n)));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sign, vsa;
+    ACCL = _mm_mullo_epi16(vs, vte);
+    ACCM = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vte, 15);
+    vsa = _mm_and_si128(vs, sign);
+    ACCM = _mm_sub_epi16(ACCM, vsa);
+    ACCH = _mm_srai_epi16(ACCM, 15);
+    vd = ACCL;
+#endif
+  }
+}
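+// Commentary (added, not original): the VMUD*/VMAD* suffix encodes operand
+// signedness and which accumulator slice feeds vd:
+//   L: unsigned * unsigned, high half -> vd = ACCL
+//   M: signed * unsigned              -> vd = ACCM
+//   N: unsigned * signed              -> vd = ACCL
+//   H: signed * signed, shifted << 16 -> vd = clamp(ACCH:ACCM)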
+
+template<bool U, u8 e>
+auto RSP::VMULF(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, (s64)vs.s16(n) * (s64)vte.s16(n) * 2 + 0x8000);
+      if constexpr (U == 0) {
+        vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+      }
+      if constexpr (U == 1) {
+        vd.u16(n) = ACCH.s16(n) < 0 ? 0x0000 : (ACCH.s16(n) ^ ACCM.s16(n)) < 0 ? 0xffff : ACCM.u16(n);
+      }
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, round, sign1, sign2, neq, eq, neg;
+    lo = _mm_mullo_epi16(vs, vte);
+    round = _mm_cmpeq_epi16(zero, zero);
+    sign1 = _mm_srli_epi16(lo, 15);
+    lo = _mm_add_epi16(lo, lo);
+    round = _mm_slli_epi16(round, 15);
+    hi = _mm_mulhi_epi16(vs, vte);
+    sign2 = _mm_srli_epi16(lo, 15);
+    ACCL = _mm_add_epi16(round, lo);
+    sign1 = _mm_add_epi16(sign1, sign2);
+    hi = _mm_slli_epi16(hi, 1);
+    neq = _mm_cmpeq_epi16(vs, vte);
+    ACCM = _mm_add_epi16(hi, sign1);
+    neg = _mm_srai_epi16(ACCM, 15);
+    if constexpr (!U) {
+      eq = _mm_and_si128(neq, neg);
+      ACCH = _mm_andnot_si128(neq, neg);
+      vd = _mm_add_epi16(ACCM, eq);
+    } else {
+      ACCH = _mm_andnot_si128(neq, neg);
+      hi = _mm_or_si128(ACCM, neg);
+      vd = _mm_andnot_si128(ACCH, hi);
+    }
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMULQ(r128& vd, cr128& vs, cr128& vt) -> void {
+  cr128 vte = vt(e);
+  for (u32 n = 0; n < 8; n++) {
+    s32 product = (s16)vs.element(n) * (s16)vte.element(n);
+    if (product < 0) product += 31;  //round
+    ACCH.element(n) = product >> 16;
+    ACCM.element(n) = product >> 0;
+    ACCL.element(n) = 0;
+    vd.element(n) = sclamp<16>(product >> 1) & ~15;
+  }
+}
+
+template<u8 e>
+auto RSP::VNAND(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = ~(vs.u16(n) & vte.u16(n));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_and_si128(vs, vt(e));
+    ACCL = _mm_xor_si128(ACCL, invert);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VNE(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, vs.u16(n) != vte.u16(n) || VCOH.get(n)) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;  //unverified
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq, ne;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    ne = _mm_cmpeq_epi16(eq, zero);
+    VCCL = _mm_and_si128(VCOH, eq);
+    VCCL = _mm_or_si128(VCCL, ne);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+auto RSP::VNOP() -> void {
+}
+
+template<u8 e>
+auto RSP::VNOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = ~(vs.u16(n) | vte.u16(n));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_or_si128(vs, vt(e));
+    ACCL = _mm_xor_si128(ACCL, invert);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VNXOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = ~(vs.u16(n) ^ vte.u16(n));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_xor_si128(vs, vt(e));
+    ACCL = _mm_xor_si128(ACCL, invert);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = vs.u16(n) | vte.u16(n);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_or_si128(vs, vt(e));
+    vd = ACCL;
+#endif
+  }
+}
+
+template<bool L, u8 e>
+auto RSP::VRCP(r128& vd, u8 de, cr128& vt) -> void {
+  s32 result = 0;
+  s32 input = L && DIVDP ? DIVIN << 16 | vt.element(e & 7) : s16(vt.element(e & 7));
+  s32 mask = input >> 31;
+  s32 data = input ^ mask;
+  if (input > -32768) data -= mask;
+  if (data == 0) {
+    result = 0x7fff'ffff;
+  } else if (input == -32768) {
+    result = 0xffff'0000;
+  } else {
+    u32 shift = __builtin_clz(data);
+    u32 index = ((u64(data) << shift) & 0x7fc0'0000) >> 22;
+    result = rspReciprocals[index];
+    result = (0x10000 | result) << 14;
+    result = (result >> (31 - shift)) ^ mask;
+  }
+  DIVDP = 0;
+  DIVOUT = result >> 16;
+  ACCL = vt(e);
+  vd.element(de) = result;
+}
+
+template<u8 e>
+auto RSP::VRCPH(r128& vd, u8 de, cr128& vt) -> void {
+  ACCL = vt(e);
+  DIVDP = 1;
+  DIVIN = vt.element(e & 7);
+  vd.element(de) = DIVOUT;
+}
+
+template<bool D, u8 e>
+auto RSP::VRND(r128& vd, u8 vs, cr128& vt) -> void {
+  cr128 vte = vt(e);
+  for (u32 n = 0; n < 8; n++) {
+    s32 product = (s16)vte.element(n);
+    if (vs & 1) product <<= 16;
+    s64 acc = 0;
+    acc |= ACCH.element(n); acc <<= 16;
+    acc |= ACCM.element(n); acc <<= 16;
+    acc |= ACCL.element(n); acc <<= 16;
+    acc >>= 16;
+    if (D == 0 && acc < 0) acc = sclip<48>(acc + product);
+    if (D == 1 && acc >= 0) acc = sclip<48>(acc + product);
+    ACCH.element(n) = acc >> 32;
+    ACCM.element(n) = acc >> 16;
+    ACCL.element(n) = acc >> 0;
+    vd.element(n) = sclamp<16>(acc >> 16);
+  }
+}
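+// Commentary (added, not original): VRCP/VRSQ normalize |input| so the
+// leading one bit reaches a fixed position, index a 512-entry reciprocal
+// table with the next 9 bits, then shift the mantissa back into place and
+// restore the sign with the XOR mask. VRSQ folds the exponent's parity into
+// the table index and halves the un-shift, because sqrt halves the exponent.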
+
+template<bool L, u8 e>
+auto RSP::VRSQ(r128& vd, u8 de, cr128& vt) -> void {
+  s32 result = 0;
+  s32 input = L && DIVDP ? DIVIN << 16 | vt.element(e & 7) : s16(vt.element(e & 7));
+  s32 mask = input >> 31;
+  s32 data = input ^ mask;
+  if (input > -32768) data -= mask;
+  if (data == 0) {
+    result = 0x7fff'ffff;
+  } else if (input == -32768) {
+    result = 0xffff'0000;
+  } else {
+    u32 shift = __builtin_clz(data);
+    u32 index = ((u64(data) << shift) & 0x7fc0'0000) >> 22;
+    result = rspInverseSquareRoots[(index & 0x1fe) | (shift & 1)];
+    result = (0x10000 | result) << 14;
+    result = (result >> ((31 - shift) >> 1)) ^ mask;
+  }
+  DIVDP = 0;
+  DIVOUT = result >> 16;
+  ACCL = vt(e);
+  vd.element(de) = result;
+}
+
+template<u8 e>
+auto RSP::VRSQH(r128& vd, u8 de, cr128& vt) -> void {
+  ACCL = vt(e);
+  DIVDP = 1;
+  DIVIN = vt.element(e & 7);
+  vd.element(de) = DIVOUT;
+}
+
+template<u8 e>
+auto RSP::VSAR(r128& vd, cr128& vs) -> void {
+  switch (e) {
+    case 0x8: vd = ACCH; break;
+    case 0x9: vd = ACCM; break;
+    case 0xa: vd = ACCL; break;
+    default: vd = zero; break;
+  }
+}
+
+template<u8 e>
+auto RSP::VSUB(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      s32 result = vs.s16(n) - vte.s16(n) - VCOL.get(n);
+      ACCL.s16(n) = result;
+      vd.s16(n) = sclamp<16>(result);
+    }
+    VCOL = zero;
+    VCOH = zero;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), udiff, sdiff, ov;
+    udiff = _mm_sub_epi16(vte, VCOL);
+    sdiff = _mm_subs_epi16(vte, VCOL);
+    ACCL = _mm_sub_epi16(vs, udiff);
+    ov = _mm_cmpgt_epi16(sdiff, udiff);
+    vd = _mm_subs_epi16(vs, sdiff);
+    vd = _mm_adds_epi16(vd, ov);
+    VCOL = zero;
+    VCOH = zero;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VSUBC(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      u32 result = vs.u16(n) - vte.u16(n);
+      ACCL.u16(n) = result;
+      VCOL.set(n, result >> 16);
+      VCOH.set(n, result != 0);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), equal, udiff, diff0;
+    udiff = _mm_subs_epu16(vs, vte);
+    equal = _mm_cmpeq_epi16(vs, vte);
+    diff0 = _mm_cmpeq_epi16(udiff, zero);
+    VCOH = _mm_cmpeq_epi16(equal, zero);
+    VCOL = _mm_andnot_si128(equal, diff0);
+    ACCL = _mm_sub_epi16(vs, vte);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VXOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = vs.u16(n) ^ vte.u16(n);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_xor_si128(vs, vt(e));
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VZERO(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      s32 result = vs.s16(n) + vte.s16(n);
+      ACCL.s16(n) = result;
+      vd.s16(n) = 0;
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_add_epi16(vs, vt(e));
+    vd = _mm_xor_si128(vd, vd);
+#endif
+  }
+}
+
+#undef ACCH
+#undef ACCM
+#undef ACCL
+#undef VCOH
+#undef VCOL
+#undef VCCH
+#undef VCCL
+#undef VCE
+
+#undef DIVIN
+#undef DIVOUT
+#undef DIVDP
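+
+// Commentary (added, not original): callers are expected to instantiate
+// these templates with fields decoded from the opcode; a hypothetical
+// decoder sketch (names assumed) would expand `vadd vd, vs, vt[e]` into a
+// switch over the 4-bit element field calling VADD<0>(...) ... VADD<15>(...).
+// The #undef block above closes the scope of the accumulator, flag, and
+// divider register aliases defined earlier in the file.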