diff --git a/RSPRecomp/src/rsp_recomp.cpp b/RSPRecomp/src/rsp_recomp.cpp
index fe04dfc..e6dfdb3 100644
--- a/RSPRecomp/src/rsp_recomp.cpp
+++ b/RSPRecomp/src/rsp_recomp.cpp
@@ -1,33 +1,583 @@
+#include <vector>
+#include <array>
+#include <fstream>
+#include <unordered_map>
+#include <unordered_set>
+#include <optional>
+#include <cassert>
 #include "rabbitizer.hpp"
 #include "fmt/format.h"
 #include "fmt/ostream.h"
-int main() {
-    //rabbitizer::InstructionRsp instr{ 0xE9DD3801, 0x040013E0 }; // suv $v29[0], 0x8($14)
-    rabbitizer::InstructionRsp instr{ 0xEAF70B84, 0x04001624 }; // ssv $v23[7], 0x8($23)
-    //rabbitizer::InstructionRsp instr{ 0x4B5E888F, 0x04001414 }; // vmadh $v2, $v17, $v30[2]
-    bool has_element = false;
-    int element = 0;
+using InstrId = rabbitizer::InstrId::UniqueId;
+using Cop0Reg = rabbitizer::Registers::Rsp::Cop0;
+constexpr size_t instr_size = sizeof(uint32_t);
 
-    fmt::print("{}\n", instr.disassemble(0));
-    fmt::print("{}\n", instr.getOpcodeName());
-    fmt::print("{}\n", instr.disassembleOperands());
+// Can't use rabbitizer's operand types because we need to be able to provide a register reference or a register index
+enum class RspOperand {
+    None,
+    Vt,
+    VtIndex,
+    Vd,
+    Vs,
+    VsIndex,
+    De,
+    Rt,
+    Rs,
+    Imm7,
+};
 
-    if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementhigh)) {
-        element = instr.GetRsp_elementhigh();
-        has_element = true;
-    } else if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementlow)) {
-        if (has_element) {
-            fmt::print(stderr, "Instruction cannot have two element values {}\n", instr.disassemble(0));
-            std::exit(EXIT_FAILURE);
-        }
-        element = instr.GetRsp_elementlow();
-        has_element = true;
-    }
+std::unordered_map<InstrId, std::array<RspOperand, 3>> vector_operands{
+    // Vt, Rs, Imm
+    { InstrId::rsp_lbv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_ldv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lfv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lhv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_llv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lpv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lqv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lrv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_lsv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_luv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    // { InstrId::rsp_lwv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}}, // Not in rabbitizer
+    { InstrId::rsp_sbv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_sdv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_sfv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_shv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_slv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_spv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_sqv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_srv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_ssv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_suv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_swv, {RspOperand::Vt, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_stv, {RspOperand::VtIndex, RspOperand::Rs, RspOperand::Imm7}},
+    { InstrId::rsp_ltv, {RspOperand::VtIndex, RspOperand::Rs, RspOperand::Imm7}},
 
-    if (has_element) {
-        fmt::print("element: 0x{:X}\n", element);
-    }
+
+    // Vd, Vs, Vt
+    { InstrId::rsp_vabs, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vadd, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vaddc, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vand, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vch, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vcl, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vcr, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_veq, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vge, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vlt, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmacf, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmacu, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadh, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadl, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadm, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmadn, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmrg, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudh, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudl, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudm, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmudn, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vne, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vnor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vnxor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vsub, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vsubc, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmulf, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmulu, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vmulq, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vnand, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vxor, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}},
+    { InstrId::rsp_vsar, {RspOperand::Vd, RspOperand::Vs, RspOperand::None}},
+    { InstrId::rsp_vmacq, {RspOperand::Vd, RspOperand::None, RspOperand::None}},
+    // { InstrId::rsp_vzero, {RspOperand::Vd, RspOperand::Vs, RspOperand::Vt}}, unused pseudo
+    { InstrId::rsp_vrndn, {RspOperand::Vd, RspOperand::VsIndex, RspOperand::Vt}},
+    { InstrId::rsp_vrndp, {RspOperand::Vd, RspOperand::VsIndex, RspOperand::Vt}},
 
-    return 0;
+    // Vd, De, Vt
+    { InstrId::rsp_vmov, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrcp, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrcpl, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrcph, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrsq, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrsql, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+    { InstrId::rsp_vrsqh, {RspOperand::Vd, RspOperand::De, RspOperand::Vt}},
+
+    // Rt, Vs
+    { InstrId::rsp_mfc2, {RspOperand::Rt, RspOperand::Vs, RspOperand::None}},
+    { InstrId::rsp_mtc2, {RspOperand::Rt, RspOperand::Vs, RspOperand::None}},
+
+    // Nop
+    { InstrId::rsp_vnop, {RspOperand::None, RspOperand::None, RspOperand::None}}
+};
+
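+// Worked example (illustrative, not part of the original source): given the rsp_vmadh row
+// above, an input like `vmadh $v2, $v17, $v30[e]` is emitted by the operand loop further
+// down roughly as:
+//   rsp.VMADH<e>(rsp.vpu.r[2], rsp.vpu.r[17], rsp.vpu.r[30]);
+// where e is the element (broadcast) selector decoded from the instruction word.
+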
+std::string_view ctx_gpr_prefix(int reg) {
+    if (reg != 0) {
+        return "r";
+    }
+    return "";
+}
+
+uint32_t expected_c0_reg_value(int cop0_reg) {
+    switch (static_cast<Cop0Reg>(cop0_reg)) {
+    case Cop0Reg::RSP_COP0_SP_STATUS:
+        return 0; // None of the flags in RSP status are set
+    case Cop0Reg::RSP_COP0_SP_DMA_FULL:
+        return 0; // Pretend DMAs complete instantly
+    case Cop0Reg::RSP_COP0_SP_DMA_BUSY:
+        return 0; // Pretend DMAs complete instantly
+    case Cop0Reg::RSP_COP0_SP_SEMAPHORE:
+        return 0; // Always acquire the semaphore
+    }
+    fmt::print(stderr, "Unhandled mfc0: {}\n", cop0_reg);
+    assert(false);
+    return 0;
+}
+
+std::string_view c0_reg_write_action(int cop0_reg) {
+    switch (static_cast<Cop0Reg>(cop0_reg)) {
+    case Cop0Reg::RSP_COP0_SP_SEMAPHORE:
+        return ""; // Ignore semaphore functionality
+    case Cop0Reg::RSP_COP0_SP_STATUS:
+        return ""; // Ignore writes to the status flags since yielding is ignored
+    case Cop0Reg::RSP_COP0_SP_DRAM_ADDR:
+        return "SET_DMA_DRAM";
+    case Cop0Reg::RSP_COP0_SP_MEM_ADDR:
+        return "SET_DMA_DMEM";
+    case Cop0Reg::RSP_COP0_SP_RD_LEN:
+        return "DO_DMA_READ";
+    case Cop0Reg::RSP_COP0_SP_WR_LEN:
+        return "DO_DMA_WRITE";
+    }
+    fmt::print(stderr, "Unhandled mtc0: {}\n", cop0_reg);
+    assert(false);
+    return "";
+}
+
+std::optional<int> get_rsp_element(const rabbitizer::InstructionRsp& instr) {
+    if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementhigh)) {
+        return instr.GetRsp_elementhigh();
+    } else if (instr.hasOperand(rabbitizer::OperandType::rsp_vt_elementlow) || instr.hasOperand(rabbitizer::OperandType::rsp_vs_index)) {
+        return instr.GetRsp_elementlow();
+    }
+
+    return std::nullopt;
+}
+
+bool rsp_ignores_element(InstrId id) {
+    return id == InstrId::rsp_vmacq || id == InstrId::rsp_vnop;
+}
+
+struct BranchTargets {
+    std::unordered_set<uint32_t> direct_targets;
+    std::unordered_set<uint32_t> indirect_targets;
+};
+
+BranchTargets get_branch_targets(const std::vector<rabbitizer::InstructionRsp>& instrs) {
+    BranchTargets ret;
+    for (const auto& instr : instrs) {
+        if (instr.isJumpWithAddress() || instr.isBranch()) {
+            ret.direct_targets.insert(instr.getBranchVramGeneric());
+        }
+        if (instr.doesLink()) {
+            ret.indirect_targets.insert(instr.getVram() + 2 * instr_size);
+        }
+    }
+    return ret;
+}
+
+bool process_instruction(size_t instr_index, const std::vector<rabbitizer::InstructionRsp>& instructions, std::ofstream& output_file, const BranchTargets& branch_targets, bool indent) {
+    const auto& instr = instructions[instr_index];
+
+    uint32_t instr_vram = instr.getVram();
+    InstrId instr_id = instr.getUniqueId();
+
+    // Print a label if one exists here
+    if (branch_targets.direct_targets.contains(instr_vram) || branch_targets.indirect_targets.contains(instr_vram)) {
+        fmt::print(output_file, "L_{:08X}:\n", instr_vram);
+    }
+
+    // Output a comment with the original instruction
+    if (instr.isBranch() || instr_id == InstrId::rsp_j) {
+        fmt::print(output_file, "    // {}\n", instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric())));
+    } else if (instr_id == InstrId::rsp_jal) {
+        fmt::print(output_file, "    // {}\n", instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric())));
+    } else {
+        fmt::print(output_file, "    // {}\n", instr.disassemble(0));
+    }
+
+    auto print_indent = [&]() {
+        fmt::print(output_file, "    ");
+    };
+
+    auto print_line = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        print_indent();
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, ";\n");
+    };
+
+    auto print_branch_condition = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, " ");
+    };
+
+    auto print_unconditional_branch = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        if (instr_index < instructions.size() - 1) {
+            uint32_t next_vram = instr_vram + 4;
+            process_instruction(instr_index + 1, instructions, output_file, branch_targets, false);
+        }
+        print_indent();
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, ";\n");
+    };
+
+    auto print_branch = [&]<typename... Ts>(fmt::format_string<Ts...> fmt_str, Ts ...args) {
+        fmt::print(output_file, "{{\n        ");
+        if (instr_index < instructions.size() - 1) {
+            uint32_t next_vram = instr_vram + 4;
+            process_instruction(instr_index + 1, instructions, output_file, branch_targets, true);
+        }
+        fmt::print(output_file, "        ");
+        fmt::print(output_file, fmt_str, args...);
+        fmt::print(output_file, ";\n    }}\n");
+    };
+
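+    // Note: MIPS branches have a delay slot, so the instruction after a branch executes
+    // regardless of the branch outcome. The branch helpers above handle this by recompiling
+    // the following instruction first: print_branch duplicates it inside the taken-branch
+    // block, while the fall-through path still emits it in normal program order.
+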
+    if (indent) {
+        print_indent();
+    }
+
+    int rd = (int)instr.GetO32_rd();
+    int rs = (int)instr.GetO32_rs();
+    int base = rs;
+    int rt = (int)instr.GetO32_rt();
+    int sa = (int)instr.Get_sa();
+
+    int fd = (int)instr.GetO32_fd();
+    int fs = (int)instr.GetO32_fs();
+    int ft = (int)instr.GetO32_ft();
+
+    uint16_t imm = instr.Get_immediate();
+
+    std::string unsigned_imm_string = fmt::format("{:#X}", imm);
+    std::string signed_imm_string = fmt::format("{:#X}", (int16_t)imm);
+
+    auto rsp_element = get_rsp_element(instr);
+
+    // If this instruction is in the vector operand table then emit the appropriate function call for its implementation
+    auto operand_find_it = vector_operands.find(instr_id);
+    if (operand_find_it != vector_operands.end()) {
+        const auto& operands = operand_find_it->second;
+        int vd = (int)instr.GetRsp_vd();
+        int vs = (int)instr.GetRsp_vs();
+        int vt = (int)instr.GetRsp_vt();
+        std::string operand_string = "";
+        for (RspOperand operand : operands) {
+            switch (operand) {
+            case RspOperand::Vt:
+                operand_string += fmt::format("rsp.vpu.r[{}], ", vt);
+                break;
+            case RspOperand::VtIndex:
+                operand_string += fmt::format("{}, ", vt);
+                break;
+            case RspOperand::Vd:
+                operand_string += fmt::format("rsp.vpu.r[{}], ", vd);
+                break;
+            case RspOperand::Vs:
+                operand_string += fmt::format("rsp.vpu.r[{}], ", vs);
+                break;
+            case RspOperand::VsIndex:
+                operand_string += fmt::format("{}, ", vs);
+                break;
+            case RspOperand::De:
+                operand_string += fmt::format("{}, ", instr.GetRsp_de());
+                break;
+            case RspOperand::Rt:
+                operand_string += fmt::format("{}{}, ", ctx_gpr_prefix(rt), rt);
+                break;
+            case RspOperand::Rs:
+                operand_string += fmt::format("{}{}, ", ctx_gpr_prefix(rs), rs);
+                break;
+            case RspOperand::Imm7:
+                // Sign extend the 7-bit immediate
+                operand_string += fmt::format("{:#X}, ", ((int8_t)(imm << 1)) >> 1);
+                break;
+            }
+        }
+        // Trim the trailing comma off the operands
+        if (operand_string.size() > 0) {
+            operand_string = operand_string.substr(0, operand_string.size() - 2);
+        }
+        std::string uppercase_name = "";
+        std::string lowercase_name = instr.getOpcodeName();
+        uppercase_name.reserve(lowercase_name.size() + 1);
+        for (char c : lowercase_name) {
+            uppercase_name += std::toupper(c);
+        }
+        if (rsp_ignores_element(instr_id)) {
+            print_line("rsp.{}({})", uppercase_name, operand_string);
+        } else {
+            print_line("rsp.{}<{}>({})", uppercase_name, rsp_element.value(), operand_string);
+        }
+    }
+    // Otherwise, implement the instruction directly
+    else {
+        switch (instr_id) {
+        case InstrId::rsp_nop:
+            fmt::print(output_file, "\n");
+            break;
+        // Arithmetic
+        case InstrId::rsp_lui:
+            print_line("{}{} = S32({} << 16)", ctx_gpr_prefix(rt), rt, unsigned_imm_string);
+            break;
+        case InstrId::rsp_add:
+        case InstrId::rsp_addu:
+            print_line("{}{} = RSP_ADD32({}{}, {}{})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_negu: // pseudo instruction for subu x, 0, y
+        case InstrId::rsp_sub:
+        case InstrId::rsp_subu:
+            print_line("{}{} = RSP_SUB32({}{}, {}{})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_addi:
+        case InstrId::rsp_addiu:
+            print_line("{}{} = RSP_ADD32({}{}, {})", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string);
+            break;
+        case InstrId::rsp_and:
+            print_line("{}{} = {}{} & {}{}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_andi:
+            print_line("{}{} = {}{} & {}", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, unsigned_imm_string);
+            break;
+        case InstrId::rsp_or:
+            print_line("{}{} = {}{} | {}{}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_ori:
+            print_line("{}{} = {}{} | {}", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, unsigned_imm_string);
+            break;
+        case InstrId::rsp_nor:
+            print_line("{}{} = ~({}{} | {}{})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_xor:
+            print_line("{}{} = {}{} ^ {}{}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_xori:
+            print_line("{}{} = {}{} ^ {}", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, unsigned_imm_string);
+            break;
+        case InstrId::rsp_sll:
+            print_line("{}{} = S32({}{}) << {}", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, sa);
+            break;
+        case InstrId::rsp_sllv:
+            print_line("{}{} = S32({}{}) << ({}{} & 31)", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs);
+            break;
+        case InstrId::rsp_sra:
+            print_line("{}{} = S32(RSP_SIGNED({}{}) >> {})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, sa);
+            break;
+        case InstrId::rsp_srav:
+            print_line("{}{} = S32(RSP_SIGNED({}{}) >> ({}{} & 31))", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs);
+            break;
+        case InstrId::rsp_srl:
+            print_line("{}{} = S32(U32({}{}) >> {})", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, sa);
+            break;
+        case InstrId::rsp_srlv:
+            print_line("{}{} = S32(U32({}{}) >> ({}{} & 31))", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs);
+            break;
+        case InstrId::rsp_slt:
+            print_line("{}{} = RSP_SIGNED({}{}) < RSP_SIGNED({}{}) ? 1 : 0", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_slti:
+            print_line("{}{} = RSP_SIGNED({}{}) < {} ? 1 : 0", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string);
+            break;
+        case InstrId::rsp_sltu:
+            print_line("{}{} = {}{} < {}{} ? 1 : 0", ctx_gpr_prefix(rd), rd, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt);
+            break;
+        case InstrId::rsp_sltiu:
+            print_line("{}{} = {}{} < {} ? 1 : 0", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string);
+            break;
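+        // The RSP_MEM_* accessors used by the loads and stores below are macros from
+        // test/src/rsp.h; they index the 4KB DMEM array directly and XOR halfword/byte
+        // addresses to account for the byteswapped word layout shared with emulated RDRAM.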
1 : 0", ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, signed_imm_string); + break; + // Loads + // TODO ld + case InstrId::rsp_lw: + print_line("{}{} = RSP_MEM_W({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lh: + print_line("{}{} = RSP_MEM_H({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lb: + print_line("{}{} = RSP_MEM_B({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lhu: + print_line("{}{} = RSP_MEM_HU({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + case InstrId::rsp_lbu: + print_line("{}{} = RSP_MEM_BU({}, {}{})", ctx_gpr_prefix(rt), rt, signed_imm_string, ctx_gpr_prefix(base), base); + break; + // Stores + case InstrId::rsp_sw: + print_line("RSP_MEM_W({}, {}{}) = {}{}", signed_imm_string, ctx_gpr_prefix(base), base, ctx_gpr_prefix(rt), rt); + break; + case InstrId::rsp_sh: + print_line("RSP_MEM_H({}, {}{}) = {}{}", signed_imm_string, ctx_gpr_prefix(base), base, ctx_gpr_prefix(rt), rt); + break; + case InstrId::rsp_sb: + print_line("RSP_MEM_B({}, {}{}) = {}{}", signed_imm_string, ctx_gpr_prefix(base), base, ctx_gpr_prefix(rt), rt); + break; + // Branches + case InstrId::rsp_j: + case InstrId::rsp_b: + print_unconditional_branch("goto L_{:08X}", instr.getBranchVramGeneric()); + break; + case InstrId::rsp_jal: + print_line("{}{} = 0x{:08X}", ctx_gpr_prefix(31), 31, instr_vram + 2 * instr_size); + print_unconditional_branch("goto L_{:08X}", instr.getBranchVramGeneric()); + break; + case InstrId::rsp_jr: + print_line("jump_target = {}{}", ctx_gpr_prefix(rs), rs); + print_unconditional_branch("goto do_indirect_jump"); + break; + case InstrId::rsp_jalr: + print_line("jump_target = {}{}; {}{} = 0x{:8X}", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rd), rd, instr_vram + 2 * instr_size); + print_unconditional_branch("goto do_indirect_jump"); + break; + case InstrId::rsp_bne: + print_indent(); + print_branch_condition("if ({}{} != {}{})", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_beq: + print_indent(); + print_branch_condition("if ({}{} == {}{})", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_bgez: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) >= 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_bgtz: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) > 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_blez: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) <= 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_bltz: + print_indent(); + print_branch_condition("if (RSP_SIGNED({}{}) < 0)", ctx_gpr_prefix(rs), rs); + print_branch("goto L_{:08X}", (uint32_t)instr.getBranchVramGeneric()); + break; + case InstrId::rsp_break: + print_line("return RspExitReason::Broke", instr_vram); + break; + case InstrId::rsp_mfc0: + print_line("{}{} = {}", ctx_gpr_prefix(rt), rt, expected_c0_reg_value(rd)); + break; + case InstrId::rsp_mtc0: + { + std::string_view write_action = 
+            if (!write_action.empty()) {
+                print_line("{}({}{})", write_action, ctx_gpr_prefix(rt), rt);
+            }
+            break;
+        }
+        default:
+            fmt::print(stderr, "Unhandled instruction: {}\n", instr.getOpcodeName());
+            assert(false);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+void write_indirect_jumps(std::ofstream& output_file, const BranchTargets& branch_targets) {
+    fmt::print(output_file,
+        "do_indirect_jump:\n"
+        "    switch (jump_target) {{ \n");
+    for (uint32_t branch_target: branch_targets.indirect_targets) {
+        fmt::print(output_file, "    case 0x{0:08X}: goto L_{0:08X};\n", branch_target);
+    }
+    fmt::print(output_file,
+        "    }}\n"
+        "    return RspExitReason::UnhandledJumpTarget;\n");
+}
+
+// TODO de-hardcode these
+constexpr size_t rsp_text_offset = 0xB8BAD0;
+constexpr size_t rsp_text_size = 0xAF0;
+constexpr size_t rsp_text_address = 0x04001080;
+std::string rom_file_path = "../test/oot_mq_debug.z64";
+std::string output_file_path = "../test/rsp/njpgdspMain.cpp";
+std::string output_function_name = "njpgdspMain";
+
+#ifdef _MSC_VER
+inline uint32_t byteswap(uint32_t val) {
+    return _byteswap_ulong(val);
+}
+#else
+constexpr uint32_t byteswap(uint32_t val) {
+    return __builtin_bswap32(val);
+}
+#endif
+
+static_assert((rsp_text_size / instr_size) * instr_size == rsp_text_size, "RSP microcode must be a multiple of the instruction size");
+
+int main() {
+    std::array<uint32_t, rsp_text_size / instr_size> instr_words{};
+    {
+        std::ifstream rom_file{ rom_file_path, std::ios_base::binary };
+
+        if (!rom_file.good()) {
+            fmt::print(stderr, "Failed to open rom file\n");
+            return EXIT_FAILURE;
+        }
+
+        rom_file.seekg(rsp_text_offset);
+        rom_file.read(reinterpret_cast<char*>(instr_words.data()), rsp_text_size);
+    }
+
+    // Disable appropriate pseudo instructions
+    RabbitizerConfig_Cfg.pseudos.pseudoMove = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBeqz = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBnez = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoNot = false;
+
+    // Decode the instruction words into instructions
+    std::vector<rabbitizer::InstructionRsp> instrs{};
+    instrs.reserve(instr_words.size());
+    uint32_t vram = rsp_text_address;
+    for (uint32_t instr_word : instr_words) {
+        const rabbitizer::InstructionRsp& instr = instrs.emplace_back(byteswap(instr_word), vram);
+        vram += instr_size;
+    }
+
+    // Collect indirect jump targets (return addresses for linked jumps)
+    BranchTargets branch_targets = get_branch_targets(instrs);
+
+    // Open output file and write beginning
+    std::ofstream output_file(output_file_path);
+    fmt::print(output_file,
+        "#include \"../src/rsp.h\"\n"
+        "#include \"../src/rsp_vu_impl.h\"\n"
+        "RspExitReason {}(uint8_t* rdram) {{\n"
+        "    uint32_t r1 = 0, r2 = 0, r3 = 0, r4 = 0, r5 = 0, r6 = 0, r7 = 0;\n"
+        "    uint32_t r8 = 0, r9 = 0, r10 = 0, r11 = 0, r12 = 0, r13 = 0, r14 = 0, r15 = 0;\n"
+        "    uint32_t r16 = 0, r17 = 0, r18 = 0, r19 = 0, r20 = 0, r21 = 0, r22 = 0, r23 = 0;\n"
+        "    uint32_t r24 = 0, r25 = 0, r26 = 0, r27 = 0, r28 = 0, r29 = 0, r30 = 0, r31 = 0;\n"
+        "    uint32_t dma_dmem_address = 0, dma_dram_address = 0, jump_target = 0;\n"
+        "    RSP rsp{{}};\n"
+        // r1 starts as the DMEM address of the OSTask (0xFC0), which the host copies
+        // there before calling the recompiled function (see run_rsp_microcode below).
+        "    r1 = 0xFC0;\n", output_function_name);
+    // Write each instruction
+    for (size_t instr_index = 0; instr_index < instrs.size(); instr_index++) {
+        process_instruction(instr_index, instrs, output_file, branch_targets, false);
+    }
+
+    // Terminate instruction code with a return to indicate that the microcode has run past its end
+    fmt::print(output_file, "    return RspExitReason::ImemOverrun;\n");
+
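+    // For reference, write_indirect_jumps below emits a dispatch block of roughly this
+    // shape (a sketch; the case list comes from the collected link targets):
+    //   do_indirect_jump:
+    //       switch (jump_target) {
+    //           case 0x04001080: goto L_04001080;
+    //           ...
+    //       }
+    //       return RspExitReason::UnhandledJumpTarget;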
+    // Write the section containing the indirect jump table
+    write_indirect_jumps(output_file, branch_targets);
+
+    // End the file
+    fmt::print(output_file, "}}\n");
+    return 0;
 }
diff --git a/lib/rabbitizer b/lib/rabbitizer
index 54f9976..b9a39f6 160000
--- a/lib/rabbitizer
+++ b/lib/rabbitizer
@@ -1 +1 @@
-Subproject commit 54f997607c62d8c1c5316ef414adf17f5c060797
+Subproject commit b9a39f6ec0a3ff6690ef2925e6275cf6578602cc
diff --git a/src/recompilation.cpp b/src/recompilation.cpp
index 95c943b..0cb5bba 100644
--- a/src/recompilation.cpp
+++ b/src/recompilation.cpp
@@ -960,6 +960,7 @@ bool process_instruction(const RecompPort::Context& context, const RecompPort::F
         return false;
     }
 
+    // TODO is this used?
     if (emit_link_branch) {
        fmt::print(output_file, "    after_{}:\n", link_branch_index);
    }
diff --git a/test/RecompTest.vcxproj b/test/RecompTest.vcxproj
index cdb109e..9dee895 100644
--- a/test/RecompTest.vcxproj
+++ b/test/RecompTest.vcxproj
@@ -162,6 +162,7 @@ XCOPY "$(ProjectDir)Lib\SDL2-2.24.0\lib\$(Platform)\SDL2.dll" "$(TargetDir)" /S
 
+    <ClCompile Include="rsp\njpgdspMain.cpp" />
 
diff --git a/test/RecompTest.vcxproj.filters b/test/RecompTest.vcxproj.filters
index 4e7083b..7f0a312 100644
--- a/test/RecompTest.vcxproj.filters
+++ b/test/RecompTest.vcxproj.filters
@@ -30234,6 +30234,9 @@
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="rsp\njpgdspMain.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
 
diff --git a/test/portultra/events.cpp b/test/portultra/events.cpp
index be7a413..e57af26 100644
--- a/test/portultra/events.cpp
+++ b/test/portultra/events.cpp
@@ -15,6 +15,7 @@
 #include "ultra64.h"
 #include "multilibultra.hpp"
 #include "recomp.h"
+#include "../src/rsp.h"
 
 struct SpTaskAction {
     OSTask task;
@@ -203,6 +204,44 @@ int sdl_event_filter(void* userdata, SDL_Event* event) {
     return 1;
 }
 
+uint8_t dmem[0x1000];
+uint16_t rspReciprocals[512];
+uint16_t rspInverseSquareRoots[512];
+
+using RspUcodeFunc = RspExitReason(uint8_t* rdram);
+extern RspUcodeFunc njpgdspMain;
+
+// From Ares emulator. For license details, see rsp_vu.h
+void rsp_constants_init() {
+    rspReciprocals[0] = u16(~0);
+    for (u16 index = 1; index < 512; index++) {
+        u64 a = index + 512;
+        u64 b = (u64(1) << 34) / a;
+        rspReciprocals[index] = u16(b + 1 >> 8);
+    }
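+    // The loop above builds the 512-entry fixed-point reciprocal table used by VRCP/VRCPL:
+    // for each normalized input (index + 512) it computes 2^34 / input and keeps the
+    // rounded top bits. The loop below does the same for reciprocal square roots, searching
+    // for the largest fixed-point b whose square times the input stays under the scale constant.
+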
+    for (u16 index = 0; index < 512; index++) {
+        u64 a = index + 512 >> ((index % 2 == 1) ? 1 : 0);
+        u64 b = 1 << 17;
+        //find the largest b where b < 1.0 / sqrt(a)
+        while (a * (b + 1) * (b + 1) < (u64(1) << 44)) b++;
+        rspInverseSquareRoots[index] = u16(b >> 1);
+    }
+}
+
+// Runs a recompiled RSP microcode
+void run_rsp_microcode(uint8_t* rdram, const OSTask* task, RspUcodeFunc* ucode_func) {
+    // Load the OSTask into DMEM
+    memcpy(&dmem[0xFC0], task, sizeof(OSTask));
+    // Load the ucode data into DMEM
+    dma_rdram_to_dmem(rdram, 0x0000, task->t.ucode_data, 0xF80 - 1);
+    // Run the ucode
+    RspExitReason exit_reason = ucode_func(rdram);
+    // Ensure that the ucode exited correctly
+    assert(exit_reason == RspExitReason::Broke);
+    sp_complete();
+}
+
 void event_thread_func(uint8_t* rdram, uint8_t* rom) {
     using namespace std::chrono_literals;
     if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_JOYSTICK) < 0) {
@@ -216,6 +255,8 @@ void event_thread_func(uint8_t* rdram, uint8_t* rom) {
     SDL_SetWindowTitle(window, "Recomp");
     //SDL_SetEventFilter(sdl_event_filter, nullptr);
 
+    rsp_constants_init();
+
     while (true) {
         // Try to pull an action from the queue
         Action action;
@@ -230,20 +271,7 @@
             } else if (task_action->task.t.type == M_AUDTASK) {
                 sp_complete();
             } else if (task_action->task.t.type == M_NJPEGTASK) {
-                uint32_t* jpeg_task = TO_PTR(uint32_t, (int32_t)(0x80000000 | task_action->task.t.data_ptr));
-                int32_t address = jpeg_task[0] | 0x80000000;
-                size_t mbCount = jpeg_task[1];
-                uint32_t mode = jpeg_task[2];
-                //int32_t qTableYPtr = jpeg_task[3] | 0x80000000;
-                //int32_t qTableUPtr = jpeg_task[4] | 0x80000000;
-                //int32_t qTableVPtr = jpeg_task[5] | 0x80000000;
-                //uint32_t mbSize = jpeg_task[6];
-                if (mode == 0) {
-                    memset(TO_PTR(void, address), 0, mbCount * 0x40 * sizeof(uint16_t) * 4);
-                } else {
-                    memset(TO_PTR(void, address), 0, mbCount * 0x40 * sizeof(uint16_t) * 6);
-                }
-                sp_complete();
+                run_rsp_microcode(rdram, &task_action->task, njpgdspMain);
             } else {
                 fprintf(stderr, "Unknown task type: %" PRIu32 "\n", task_action->task.t.type);
                 assert(false);
diff --git a/test/rsp/.gitignore b/test/rsp/.gitignore
new file mode 100644
index 0000000..06fee25
--- /dev/null
+++ b/test/rsp/.gitignore
@@ -0,0 +1 @@
+njpgdspMain.cpp
diff --git a/test/src/rsp.h b/test/src/rsp.h
new file mode 100644
index 0000000..a5e53fb
--- /dev/null
+++ b/test/src/rsp.h
@@ -0,0 +1,65 @@
+#ifndef __RSP_H__
+#define __RSP_H__
+
+#include "rsp_vu.h"
+#include "recomp.h"
+
+enum class RspExitReason {
+    Invalid,
+    Broke,
+    ImemOverrun,
+    UnhandledJumpTarget
+};
+
+extern uint8_t dmem[];
+extern uint16_t rspReciprocals[512];
+extern uint16_t rspInverseSquareRoots[512];
+
+#define RSP_MEM_W(offset, addr) \
+    (*reinterpret_cast<uint32_t*>(dmem + (offset) + (addr)))
+
+#define RSP_MEM_H(offset, addr) \
+    (*reinterpret_cast<int16_t*>(dmem + (((offset) + (addr)) ^ 2)))
+
+#define RSP_MEM_HU(offset, addr) \
+    (*reinterpret_cast<uint16_t*>(dmem + (((offset) + (addr)) ^ 2)))
+
+#define RSP_MEM_B(offset, addr) \
+    (*reinterpret_cast<int8_t*>(dmem + (((offset) + (addr)) ^ 3)))
+
+#define RSP_MEM_BU(offset, addr) \
+    (*reinterpret_cast<uint8_t*>(dmem + (((offset) + (addr)) ^ 3)))
+
+#define RSP_ADD32(a, b) \
+    ((int32_t)((a) + (b)))
+
+#define RSP_SUB32(a, b) \
+    ((int32_t)((a) - (b)))
+
+#define RSP_SIGNED(val) \
+    ((int32_t)(val))
+
+#define SET_DMA_DMEM(dmem_addr) dma_dmem_address = (dmem_addr)
+#define SET_DMA_DRAM(dram_addr) dma_dram_address = (dram_addr)
+#define DO_DMA_READ(rd_len) dma_rdram_to_dmem(rdram, dma_dmem_address, dma_dram_address, (rd_len))
+#define DO_DMA_WRITE(wr_len) dma_dmem_to_rdram(rdram, dma_dmem_address, dma_dram_address, (wr_len))
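+
+// Usage sketch (hypothetical addresses): recompiled mtc0 writes drive these macros, so a
+// DMEM -> RDRAM transfer ends up as a sequence like
+//   SET_DMA_DMEM(0x400); SET_DMA_DRAM(0x100000); DO_DMA_WRITE(0x7FF);
+// where the length register holds the byte count minus one, hence the "+ 1" in the
+// helpers below.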
+
+static inline void dma_rdram_to_dmem(uint8_t* rdram, uint32_t dmem_addr, uint32_t dram_addr, uint32_t rd_len) {
+    rd_len += 1; // Read length is inclusive
+    dram_addr &= 0xFFFFF8;
+    assert(dmem_addr + rd_len <= 0x1000);
+    for (uint32_t i = 0; i < rd_len; i++) {
+        RSP_MEM_B(i, dmem_addr) = MEM_B(0, (int64_t)(int32_t)(dram_addr + i + 0x80000000));
+    }
+}
+
+static inline void dma_dmem_to_rdram(uint8_t* rdram, uint32_t dmem_addr, uint32_t dram_addr, uint32_t wr_len) {
+    wr_len += 1; // Write length is inclusive
+    dram_addr &= 0xFFFFF8;
+    assert(dmem_addr + wr_len <= 0x1000);
+    for (uint32_t i = 0; i < wr_len; i++) {
+        MEM_B(0, (int64_t)(int32_t)(dram_addr + i + 0x80000000)) = RSP_MEM_B(i, dmem_addr);
+    }
+}
+
+#endif
diff --git a/test/src/rsp_vu.h b/test/src/rsp_vu.h
new file mode 100644
index 0000000..8ec9b3b
--- /dev/null
+++ b/test/src/rsp_vu.h
@@ -0,0 +1,199 @@
+// This file is modified from the Ares N64 emulator core. Ares can
+// be found at https://github.com/ares-emulator/ares. The original license
+// for this portion of Ares is as follows:
+// ----------------------------------------------------------------------
+// ares
+//
+// Copyright(c) 2004 - 2021 ares team, Near et al
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------
+#include <cstdint>
+
+#define ARCHITECTURE_AMD64
+#define ARCHITECTURE_SUPPORTS_SSE4_1 1
+
+#if defined(ARCHITECTURE_AMD64)
+#include <immintrin.h>
+using v128 = __m128i;
+#elif defined(ARCHITECTURE_ARM64)
+#include <sse2neon.h>
+using v128 = __m128i;
+#endif
+
+namespace Accuracy {
+    namespace RSP {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        constexpr bool SISD = false;
+        constexpr bool SIMD = true;
+#else
+        constexpr bool SISD = true;
+        constexpr bool SIMD = false;
+#endif
+    }
+}
+
+using u8 = uint8_t;
+using s8 = int8_t;
+using u16 = uint16_t;
+using s16 = int16_t;
+using u32 = uint32_t;
+using s32 = int32_t;
+using u64 = uint64_t;
+using s64 = int64_t;
+using uint128_t = uint64_t[2];
+
+template<u32 bits> inline auto sclamp(s64 x) -> s64 {
+    enum : s64 { b = 1ull << (bits - 1), m = b - 1 };
+    return (x > m) ? m : (x < -b) ? -b : x;
+}
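+
+// e.g. sclamp<16> saturates a wider intermediate into signed 16-bit range:
+// sclamp<16>(40000) == 32767 and sclamp<16>(-40000) == -32768, which is how the vector
+// unit clamps per-lane results in the implementations in rsp_vu_impl.h.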
+
+struct RSP {
+    using r32 = uint32_t;
+    using cr32 = const r32;
+
+    union r128 {
+        struct { uint64_t u128[2]; };
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        struct { __m128i v128; };
+
+        operator __m128i() const { return v128; }
+        auto operator=(__m128i value) { v128 = value; }
+#endif
+
+        auto byte(u32 index) -> uint8_t& { return ((uint8_t*)&u128)[15 - index]; }
+        auto byte(u32 index) const -> uint8_t { return ((uint8_t*)&u128)[15 - index]; }
+
+        auto element(u32 index) -> uint16_t& { return ((uint16_t*)&u128)[7 - index]; }
+        auto element(u32 index) const -> uint16_t { return ((uint16_t*)&u128)[7 - index]; }
+
+        auto u8(u32 index) -> uint8_t& { return ((uint8_t*)&u128)[15 - index]; }
+        auto u8(u32 index) const -> uint8_t { return ((uint8_t*)&u128)[15 - index]; }
+
+        auto s16(u32 index) -> int16_t& { return ((int16_t*)&u128)[7 - index]; }
+        auto s16(u32 index) const -> int16_t { return ((int16_t*)&u128)[7 - index]; }
+
+        auto u16(u32 index) -> uint16_t& { return ((uint16_t*)&u128)[7 - index]; }
+        auto u16(u32 index) const -> uint16_t { return ((uint16_t*)&u128)[7 - index]; }
+
+        //VCx registers
+        auto get(u32 index) const -> bool { return u16(index) != 0; }
+        auto set(u32 index, bool value) -> bool { return u16(index) = 0 - value, value; }
+
+        //vu-registers.cpp
+        auto operator()(u32 index) const -> r128;
+    };
+    using cr128 = const r128;
+
+    struct VU {
+        r128 r[32];
+        r128 acch, accm, accl;
+        r128 vcoh, vcol; //16-bit little endian
+        r128 vcch, vccl; //16-bit little endian
+        r128 vce; // 8-bit little endian
+        s16 divin;
+        s16 divout;
+        bool divdp;
+    } vpu;
+
+    static constexpr r128 zero{0};
+    static constexpr r128 invert{(uint64_t)-1, (uint64_t)-1};
+
+    auto accumulatorGet(u32 index) const -> u64;
+    auto accumulatorSet(u32 index, u64 value) -> void;
+    auto accumulatorSaturate(u32 index, bool slice, u16 negative, u16 positive) const -> u16;
+
+    auto CFC2(r32& rt, u8 rd) -> void;
+    auto CTC2(cr32& rt, u8 rd) -> void;
+    template<u8 e> auto LBV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LDV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LFV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LHV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LLV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LPV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LQV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LRV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LSV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LTV(u8 vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LUV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto LWV(r128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto MFC2(r32& rt, cr128& vs) -> void;
+    template<u8 e> auto MTC2(cr32& rt, r128& vs) -> void;
+    template<u8 e> auto SBV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SDV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SFV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SHV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SLV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SPV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SQV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SRV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SSV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto STV(u8 vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SUV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto SWV(cr128& vt, cr32& rs, s8 imm) -> void;
+    template<u8 e> auto VABS(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VADD(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VADDC(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VAND(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VCH(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VCL(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VCR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VEQ(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VGE(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VLT(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<bool U, u8 e>
+    auto VMACF(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMACF(r128& vd, cr128& vs, cr128& vt) -> void { VMACF<0, e>(vd, vs, vt); }
+    template<u8 e> auto VMACU(r128& vd, cr128& vs, cr128& vt) -> void { VMACF<1, e>(vd, vs, vt); }
+    auto VMACQ(r128& vd) -> void;
+    template<u8 e> auto VMADH(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMADL(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMADM(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMADN(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMOV(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VMRG(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDH(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDL(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDM(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMUDN(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<bool U, u8 e>
+    auto VMULF(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VMULF(r128& rd, cr128& vs, cr128& vt) -> void { VMULF<0, e>(rd, vs, vt); }
+    template<u8 e> auto VMULU(r128& rd, cr128& vs, cr128& vt) -> void { VMULF<1, e>(rd, vs, vt); }
+    template<u8 e> auto VMULQ(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VNAND(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VNE(r128& vd, cr128& vs, cr128& vt) -> void;
+    auto VNOP() -> void;
+    template<u8 e> auto VNOR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VNXOR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VOR(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<bool L, u8 e>
+    auto VRCP(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VRCP(r128& vd, u8 de, cr128& vt) -> void { VRCP<0, e>(vd, de, vt); }
+    template<u8 e> auto VRCPL(r128& vd, u8 de, cr128& vt) -> void { VRCP<1, e>(vd, de, vt); }
+    template<u8 e> auto VRCPH(r128& vd, u8 de, cr128& vt) -> void;
+    template<bool D, u8 e>
+    auto VRND(r128& vd, u8 vs, cr128& vt) -> void;
+    template<u8 e> auto VRNDN(r128& vd, u8 vs, cr128& vt) -> void { VRND<0, e>(vd, vs, vt); }
+    template<u8 e> auto VRNDP(r128& vd, u8 vs, cr128& vt) -> void { VRND<1, e>(vd, vs, vt); }
+    template<bool L, u8 e>
+    auto VRSQ(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VRSQ(r128& vd, u8 de, cr128& vt) -> void { VRSQ<0, e>(vd, de, vt); }
+    template<u8 e> auto VRSQL(r128& vd, u8 de, cr128& vt) -> void { VRSQ<1, e>(vd, de, vt); }
+    template<u8 e> auto VRSQH(r128& vd, u8 de, cr128& vt) -> void;
+    template<u8 e> auto VSAR(r128& vd, cr128& vs) -> void;
+    template<u8 e> auto VSUB(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VSUBC(r128& vd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VXOR(r128& rd, cr128& vs, cr128& vt) -> void;
+    template<u8 e> auto VZERO(r128& rd, cr128& vs, cr128& vt) -> void;
+};
diff --git a/test/src/rsp_vu_impl.h b/test/src/rsp_vu_impl.h
new file mode 100644
index 0000000..8c22d14
--- /dev/null
+++ b/test/src/rsp_vu_impl.h
@@ -0,0 +1,1537 @@
+// This file is modified from the Ares N64 emulator core. Ares can
+// be found at https://github.com/ares-emulator/ares. The original license
+// for this portion of Ares is as follows:
+// ----------------------------------------------------------------------
+// ares
+//
+// Copyright(c) 2004 - 2021 ares team, Near et al
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// ----------------------------------------------------------------------
+
+#include <cstdint>
+#include <algorithm>
+using u32 = uint32_t;
+
+#define ACCH vpu.acch
+#define ACCM vpu.accm
+#define ACCL vpu.accl
+#define VCOH vpu.vcoh
+#define VCOL vpu.vcol
+#define VCCH vpu.vcch
+#define VCCL vpu.vccl
+#define VCE vpu.vce
+
+#define DIVIN vpu.divin
+#define DIVOUT vpu.divout
+#define DIVDP vpu.divdp
+
+auto RSP::r128::operator()(u32 index) const -> r128 {
+    if constexpr (Accuracy::RSP::SISD) {
+        r128 v{ *this };
+        switch (index) {
+        case 0: break;
+        case 1: break;
+        case 2: v.u16(1) = v.u16(0); v.u16(3) = v.u16(2); v.u16(5) = v.u16(4); v.u16(7) = v.u16(6); break;
+        case 3: v.u16(0) = v.u16(1); v.u16(2) = v.u16(3); v.u16(4) = v.u16(5); v.u16(6) = v.u16(7); break;
+        case 4: v.u16(1) = v.u16(2) = v.u16(3) = v.u16(0); v.u16(5) = v.u16(6) = v.u16(7) = v.u16(4); break;
+        case 5: v.u16(0) = v.u16(2) = v.u16(3) = v.u16(1); v.u16(4) = v.u16(6) = v.u16(7) = v.u16(5); break;
+        case 6: v.u16(0) = v.u16(1) = v.u16(3) = v.u16(2); v.u16(4) = v.u16(5) = v.u16(7) = v.u16(6); break;
+        case 7: v.u16(0) = v.u16(1) = v.u16(2) = v.u16(3); v.u16(4) = v.u16(5) = v.u16(6) = v.u16(7); break;
+        case 8: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(0); break;
+        case 9: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(1); break;
+        case 10: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(2); break;
+        case 11: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(3); break;
+        case 12: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(4); break;
+        case 13: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(5); break;
+        case 14: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(6); break;
+        case 15: for (u32 n = 0; n < 8; n++) v.u16(n) = v.u16(7); break;
+        }
+        return v;
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        static const __m128i shuffle[16] = {
+            //vector
+            _mm_set_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), //01234567
+            _mm_set_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), //01234567
+            //scalar quarter
+            _mm_set_epi8(15,14,15,14,11,10,11,10, 7, 6, 7, 6, 3, 2, 3, 2), //00224466
+            _mm_set_epi8(13,12,13,12, 9, 8, 9, 8, 5, 4, 5, 4, 1, 0, 1, 0), //11335577
+            //scalar half
+            _mm_set_epi8(15,14,15,14,15,14,15,14, 7, 6, 7, 6, 7, 6, 7, 6), //00004444
+            _mm_set_epi8(13,12,13,12,13,12,13,12, 5, 4, 5, 4, 5, 4, 5, 4), //11115555
+            _mm_set_epi8(11,10,11,10,11,10,11,10, 3, 2, 3, 2, 3, 2, 3, 2), //22226666
+            _mm_set_epi8(9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0), //33337777
+            //scalar whole
+            _mm_set_epi8(15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14), //00000000
+            _mm_set_epi8(13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12), //11111111
+            _mm_set_epi8(11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10), //22222222
+            _mm_set_epi8(9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8), //33333333
+            _mm_set_epi8(7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6), //44444444
+            _mm_set_epi8(5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4), //55555555
+            _mm_set_epi8(3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2), //66666666
+            _mm_set_epi8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0), //77777777
+        };
+        //todo: benchmark to see if testing for cases 0&1 to return value directly is faster
+        r128 ret;
+        ret.v128 = _mm_shuffle_epi8(v128, shuffle[index]);
+        return ret;
+#endif
+    }
+}
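+
+// operator()(e) implements the RSP element selector used as vt(e) throughout this file:
+// e = 0 or 1 returns the vector unchanged, 2..3 repeat every other lane ("scalar quarter"),
+// 4..7 repeat each group of four ("scalar half"), and 8..15 broadcast lane (e - 8) to all
+// eight lanes; e.g. vt(9) yields element 1 in every lane.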
+
+auto RSP::accumulatorGet(u32 index) const -> u64 {
+    return (u64)ACCH.u16(index) << 32 | (u64)ACCM.u16(index) << 16 | (u64)ACCL.u16(index) << 0;
+}
+
+auto RSP::accumulatorSet(u32 index, u64 value) -> void {
+    ACCH.u16(index) = value >> 32;
+    ACCM.u16(index) = value >> 16;
+    ACCL.u16(index) = value >> 0;
+}
+
+auto RSP::accumulatorSaturate(u32 index, bool slice, u16 negative, u16 positive) const -> u16 {
+    if (ACCH.s16(index) < 0) {
+        if (ACCH.u16(index) != 0xffff) return negative;
+        if (ACCM.s16(index) >= 0) return negative;
+    } else {
+        if (ACCH.u16(index) != 0x0000) return positive;
+        if (ACCM.s16(index) < 0) return positive;
+    }
+    return !slice ? ACCL.u16(index) : ACCM.u16(index);
+}
+
+auto RSP::CFC2(r32& rt, u8 rd) -> void {
+    r128 hi, lo;
+    switch (rd & 3) {
+    case 0x00: hi = VCOH; lo = VCOL; break;
+    case 0x01: hi = VCCH; lo = VCCL; break;
+    case 0x02: hi = zero; lo = VCE; break;
+    case 0x03: hi = zero; lo = VCE; break; //unverified
+    }
+
+    if constexpr (Accuracy::RSP::SISD) {
+        rt = 0;
+        for (u32 n = 0; n < 8; n++) {
+            rt |= lo.get(n) << 0 + n;
+            rt |= hi.get(n) << 8 + n;
+        }
+        rt = s16(rt);
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        static const v128 reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        rt = s16(_mm_movemask_epi8(_mm_shuffle_epi8(_mm_packs_epi16(hi, lo), reverse)));
+#endif
+    }
+}
+
+auto RSP::CTC2(cr32& rt, u8 rd) -> void {
+    r128* hi; r128* lo;
+    r128 null;
+    switch (rd & 3) {
+    case 0x00: hi = &VCOH; lo = &VCOL; break;
+    case 0x01: hi = &VCCH; lo = &VCCL; break;
+    case 0x02: hi = &null; lo = &VCE; break;
+    case 0x03: hi = &null; lo = &VCE; break; //unverified
+    }
+
+    if constexpr (Accuracy::RSP::SISD) {
+        for (u32 n = 0; n < 8; n++) {
+            lo->set(n, rt & 1 << 0 + n);
+            hi->set(n, rt & 1 << 8 + n);
+        }
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        static const v128 mask = _mm_set_epi16(0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080);
+        lo->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{ ~rt >> 0 }, zero), mask), zero);
+        hi->v128 = _mm_cmpeq_epi8(_mm_and_si128(_mm_shuffle_epi8(r128{ ~rt >> 8 }, zero), mask), zero);
+#endif
+    }
+}
+
+template<u8 e>
+auto RSP::LBV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm;
+    vt.byte(e) = RSP_MEM_B(0, address);
+}
+
+template<u8 e>
+auto RSP::LDV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = std::min(start + 8, 16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LFV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    auto start = e;
+    auto end = std::min(start + 8, 16);
+    r128 tmp;
+    for (u32 offset = 0; offset < 4; offset++) {
+        tmp.element(offset + 0) = RSP_MEM_B(0, address + (index + offset * 4 + 0 & 15)) << 7;
+        tmp.element(offset + 4) = RSP_MEM_B(0, address + (index + offset * 4 + 8 & 15)) << 7;
+    }
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset) = tmp.byte(offset);
+    }
+}
+
+template<u8 e>
+auto RSP::LHV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        vt.element(offset) = RSP_MEM_B(0, address + (index + offset * 2 & 15)) << 7;
+    }
+}
+
+template<u8 e>
+auto RSP::LLV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 4;
+    auto start = e;
+    auto end = std::min(start + 4, 16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LPV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        vt.element(offset) = RSP_MEM_B(0, address + (index + offset & 15)) << 8;
+    }
+}
+
+template<u8 e>
+auto RSP::LQV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = std::min((u32)(16 + e - (address & 15)), (u32)16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LRV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = e;
+    auto start = 16 - ((address & 15) - index);
+    address &= ~15;
+    for (u32 offset = start; offset < 16; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LSV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 2;
+    auto start = e;
+    auto end = std::min(start + 2, 16);
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address++);
+    }
+}
+
+template<u8 e>
+auto RSP::LTV(u8 vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto begin = address & ~7;
+    address = begin + ((e + (address & 8)) & 15);
+    auto vtbase = vt & ~7;
+    auto vtoff = e >> 1;
+    for (u32 i = 0; i < 8; i++) {
+        vpu.r[vtbase + vtoff].byte(i * 2 + 0) = RSP_MEM_B(0, address++);
+        if (address == begin + 16) address = begin;
+        vpu.r[vtbase + vtoff].byte(i * 2 + 1) = RSP_MEM_B(0, address++);
+        if (address == begin + 16) address = begin;
+        vtoff = vtoff + 1 & 7;
+    }
+}
+
+template<u8 e>
+auto RSP::LUV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto index = (address & 7) - e;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        vt.element(offset) = RSP_MEM_B(0, address + (index + offset & 15)) << 7;
+    }
+}
+
+template<u8 e>
+auto RSP::LWV(r128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = 16 - e;
+    auto end = e + 16;
+    for (u32 offset = start; offset < end; offset++) {
+        vt.byte(offset & 15) = RSP_MEM_B(0, address);
+        address += 4;
+    }
+}
+
+template<u8 e>
+auto RSP::MFC2(r32& rt, cr128& vs) -> void {
+    auto hi = vs.byte(e + 0 & 15);
+    auto lo = vs.byte(e + 1 & 15);
+    rt = s16(hi << 8 | lo << 0);
+}
+
+template<u8 e>
+auto RSP::MTC2(cr32& rt, r128& vs) -> void {
+    vs.byte(e + 0) = rt >> 8;
+    if (e != 15) vs.byte(e + 1) = rt >> 0;
+}
+
+template<u8 e>
+auto RSP::SBV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm;
+    RSP_MEM_B(0, address) = vt.byte(e);
+}
+
+template<u8 e>
+auto RSP::SDV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = start + 8;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SFV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto base = address & 7;
+    address &= ~7;
+    switch (e) {
+    case 0: case 15:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(0) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(1) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(2) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(3) >> 7;
+        break;
+    case 1:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(6) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(7) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(4) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(5) >> 7;
+        break;
+    case 4:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(1) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(2) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(3) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(0) >> 7;
+        break;
+    case 5:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(7) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(4) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(5) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(6) >> 7;
+        break;
+    case 8:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(4) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(5) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(6) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(7) >> 7;
+        break;
+    case 11:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(3) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(0) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(1) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(2) >> 7;
+        break;
+    case 12:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = vt.element(5) >> 7;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = vt.element(6) >> 7;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = vt.element(7) >> 7;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = vt.element(4) >> 7;
+        break;
+    default:
+        RSP_MEM_B(0, address + (base + 0 & 15)) = 0;
+        RSP_MEM_B(0, address + (base + 4 & 15)) = 0;
+        RSP_MEM_B(0, address + (base + 8 & 15)) = 0;
+        RSP_MEM_B(0, address + (base + 12 & 15)) = 0;
+        break;
+    }
+}
+
+template<u8 e>
+auto RSP::SHV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto index = address & 7;
+    address &= ~7;
+    for (u32 offset = 0; offset < 8; offset++) {
+        auto byte = e + offset * 2;
+        auto value = vt.byte(byte + 0 & 15) << 1 | vt.byte(byte + 1 & 15) >> 7;
+        RSP_MEM_B(0, address + (index + offset * 2 & 15)) = value;
+    }
+}
+
+template<u8 e>
+auto RSP::SLV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 4;
+    auto start = e;
+    auto end = start + 4;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SPV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = start + 8;
+    for (u32 offset = start; offset < end; offset++) {
+        if ((offset & 15) < 8) {
+            RSP_MEM_B(0, address++) = vt.byte((offset & 7) << 1);
+        } else {
+            RSP_MEM_B(0, address++) = vt.element(offset & 7) >> 7;
+        }
+    }
+}
+
+template<u8 e>
+auto RSP::SQV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = start + (16 - (address & 15));
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SRV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = start + (address & 15);
+    auto base = 16 - (address & 15);
+    address &= ~15;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset + base & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SSV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 2;
+    auto start = e;
+    auto end = start + 2;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address++) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::STV(u8 vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = vt & ~7;
+    auto end = start + 8;
+    auto element = 16 - (e & ~1);
+    auto base = (address & 7) - (e & ~1);
+    address &= ~7;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address + (base++ & 15)) = vpu.r[offset].byte(element++ & 15);
+        RSP_MEM_B(0, address + (base++ & 15)) = vpu.r[offset].byte(element++ & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::SUV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 8;
+    auto start = e;
+    auto end = start + 8;
+    for (u32 offset = start; offset < end; offset++) {
+        if ((offset & 15) < 8) {
+            RSP_MEM_B(0, address++) = vt.element(offset & 7) >> 7;
+        } else {
+            RSP_MEM_B(0, address++) = vt.byte((offset & 7) << 1);
+        }
+    }
+}
+
+template<u8 e>
+auto RSP::SWV(cr128& vt, cr32& rs, s8 imm) -> void {
+    auto address = rs + imm * 16;
+    auto start = e;
+    auto end = start + 16;
+    auto base = address & 7;
+    address &= ~7;
+    for (u32 offset = start; offset < end; offset++) {
+        RSP_MEM_B(0, address + (base++ & 15)) = vt.byte(offset & 15);
+    }
+}
+
+template<u8 e>
+auto RSP::VABS(r128& vd, cr128& vs, cr128& vt) -> void {
+    if constexpr (Accuracy::RSP::SISD) {
+        r128 vte = vt(e);
+        for (u32 n = 0; n < 8; n++) {
+            if (vs.s16(n) < 0) {
+                if (vte.s16(n) == -32768) {
+                    ACCL.s16(n) = -32768;
+                    vd.s16(n) = 32767;
+                } else {
+                    ACCL.s16(n) = -vte.s16(n);
+                    vd.s16(n) = -vte.s16(n);
+                }
+            } else if (vs.s16(n) > 0) {
+                ACCL.s16(n) = +vte.s16(n);
+                vd.s16(n) = +vte.s16(n);
+            } else {
+                ACCL.s16(n) = 0;
+                vd.s16(n) = 0;
+            }
+        }
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        r128 vs0, slt;
+        vs0 = _mm_cmpeq_epi16(vs, zero);
+        slt = _mm_srai_epi16(vs, 15);
+        vd = _mm_andnot_si128(vs0, vt(e));
+        vd = _mm_xor_si128(vd, slt);
+        ACCL = _mm_sub_epi16(vd, slt);
+        vd = _mm_subs_epi16(vd, slt);
+#endif
+    }
+}
+
+template<u8 e>
+auto RSP::VADD(r128& vd, cr128& vs, cr128& vt) -> void {
+    if constexpr (Accuracy::RSP::SISD) {
+        cr128 vte = vt(e);
+        for (u32 n = 0; n < 8; n++) {
+            s32 result = vs.s16(n) + vte.s16(n) + VCOL.get(n);
+            ACCL.s16(n) = result;
+            vd.s16(n) = sclamp<16>(result);
+        }
+        VCOL = zero;
+        VCOH = zero;
+    }
+
+    if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+        r128 vte = vt(e), sum, min, max;
+        sum = _mm_add_epi16(vs, vte);
+        ACCL = _mm_sub_epi16(sum, VCOL);
+        min = _mm_min_epi16(vs, vte);
+        max = _mm_max_epi16(vs, vte);
+        min = _mm_subs_epi16(min, VCOL);
+        vd = _mm_adds_epi16(min, max);
+        VCOL = zero;
+        VCOH = zero;
+#endif
+    }
+}
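+
+// Note on the SSE path of VADD above: VCOL holds each lane's carry-in as 0 or -1, so
+// subtracting it adds the carry. Applying the carry to the smaller operand first with a
+// saturating subtract, then saturating-adding the larger operand, reproduces the
+// hardware's clamped result without widening to 32 bits.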
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sum;
+    sum = _mm_adds_epu16(vs, vte);
+    ACCL = _mm_add_epi16(vs, vte);
+    VCOL = _mm_cmpeq_epi16(sum, ACCL);
+    VCOL = _mm_cmpeq_epi16(VCOL, zero);
+    VCOH = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VAND(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    r128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = vs.u16(n) & vte.u16(n);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_and_si128(vs, vt(e));
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VCH(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      if ((vs.s16(n) ^ vte.s16(n)) < 0) {
+        s16 result = vs.s16(n) + vte.s16(n);
+        ACCL.s16(n) = (result <= 0 ? -vte.s16(n) : vs.s16(n));
+        VCCL.set(n, result <= 0);
+        VCCH.set(n, vte.s16(n) < 0);
+        VCOL.set(n, 1);
+        VCOH.set(n, result != 0 && vs.u16(n) != (vte.u16(n) ^ 0xffff));
+        VCE.set(n, result == -1);
+      } else {
+        s16 result = vs.s16(n) - vte.s16(n);
+        ACCL.s16(n) = (result >= 0 ? vte.s16(n) : vs.s16(n));
+        VCCL.set(n, vte.s16(n) < 0);
+        VCCH.set(n, result >= 0);
+        VCOL.set(n, 0);
+        VCOH.set(n, result != 0 && vs.u16(n) != (vte.u16(n) ^ 0xffff));
+        VCE.set(n, 0);
+      }
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), nvt, diff, diff0, vtn, dlez, dgez, mask;
+    VCOL = _mm_xor_si128(vs, vte);
+    VCOL = _mm_cmplt_epi16(VCOL, zero);
+    nvt = _mm_xor_si128(vte, VCOL);
+    nvt = _mm_sub_epi16(nvt, VCOL);
+    diff = _mm_sub_epi16(vs, nvt);
+    diff0 = _mm_cmpeq_epi16(diff, zero);
+    vtn = _mm_cmplt_epi16(vte, zero);
+    dlez = _mm_cmpgt_epi16(diff, zero);
+    dgez = _mm_or_si128(dlez, diff0);
+    dlez = _mm_cmpeq_epi16(zero, dlez);
+    VCCH = _mm_blendv_epi8(dgez, vtn, VCOL);
+    VCCL = _mm_blendv_epi8(vtn, dlez, VCOL);
+    VCE = _mm_cmpeq_epi16(diff, VCOL);
+    VCE = _mm_and_si128(VCE, VCOL);
+    VCOH = _mm_or_si128(diff0, VCE);
+    VCOH = _mm_cmpeq_epi16(VCOH, zero);
+    mask = _mm_blendv_epi8(VCCH, VCCL, VCOL);
+    ACCL = _mm_blendv_epi8(vs, nvt, mask);
+    vd = ACCL;
+#endif
+  }
+}
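+// Commentary (added, not original): VCH seeds the clip flags (VCCL/VCCH,
+// VCOL/VCOH, VCE) from the sign-mismatch test above, and VCL below consumes
+// them to finish the clamp; microcode conventionally pairs VCH on the high
+// 16 bits of a 32-bit value with VCL on the low 16 bits.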
+
+template<u8 e>
+auto RSP::VCL(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      if (VCOL.get(n)) {
+        if (VCOH.get(n)) {
+          ACCL.u16(n) = VCCL.get(n) ? -vte.u16(n) : vs.u16(n);
+        } else {
+          u16 sum = vs.u16(n) + vte.u16(n);
+          bool carry = (vs.u16(n) + vte.u16(n)) != sum;
+          if (VCE.get(n)) {
+            ACCL.u16(n) = VCCL.set(n, (!sum || !carry)) ? -vte.u16(n) : vs.u16(n);
+          } else {
+            ACCL.u16(n) = VCCL.set(n, (!sum && !carry)) ? -vte.u16(n) : vs.u16(n);
+          }
+        }
+      } else {
+        if (VCOH.get(n)) {
+          ACCL.u16(n) = VCCH.get(n) ? vte.u16(n) : vs.u16(n);
+        } else {
+          ACCL.u16(n) = VCCH.set(n, (s32)vs.u16(n) - (s32)vte.u16(n) >= 0) ? vte.u16(n) : vs.u16(n);
+        }
+      }
+    }
+    VCOL = zero;
+    VCOH = zero;
+    VCE = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), nvt, diff, ncarry, nvce, diff0, lec1, lec2, leeq, geeq, le, ge, mask;
+    nvt = _mm_xor_si128(vte, VCOL);
+    nvt = _mm_sub_epi16(nvt, VCOL);
+    diff = _mm_sub_epi16(vs, nvt);
+    ncarry = _mm_adds_epu16(vs, vte);
+    ncarry = _mm_cmpeq_epi16(diff, ncarry);
+    nvce = _mm_cmpeq_epi16(VCE, zero);
+    diff0 = _mm_cmpeq_epi16(diff, zero);
+    lec1 = _mm_and_si128(diff0, ncarry);
+    lec1 = _mm_and_si128(nvce, lec1);
+    lec2 = _mm_or_si128(diff0, ncarry);
+    lec2 = _mm_and_si128(VCE, lec2);
+    leeq = _mm_or_si128(lec1, lec2);
+    geeq = _mm_subs_epu16(vte, vs);
+    geeq = _mm_cmpeq_epi16(geeq, zero);
+    le = _mm_andnot_si128(VCOH, VCOL);
+    le = _mm_blendv_epi8(VCCL, leeq, le);
+    ge = _mm_or_si128(VCOL, VCOH);
+    ge = _mm_blendv_epi8(geeq, VCCH, ge);
+    mask = _mm_blendv_epi8(ge, le, VCOL);
+    ACCL = _mm_blendv_epi8(vs, nvt, mask);
+    VCCH = ge;
+    VCCL = le;
+    VCOH = zero;
+    VCOL = zero;
+    VCE = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VCR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      if ((vs.s16(n) ^ vte.s16(n)) < 0) {
+        VCCH.set(n, vte.s16(n) < 0);
+        ACCL.u16(n) = VCCL.set(n, vs.s16(n) + vte.s16(n) + 1 <= 0) ? ~vte.u16(n) : vs.u16(n);
+      } else {
+        VCCL.set(n, vte.s16(n) < 0);
+        ACCL.u16(n) = VCCH.set(n, vs.s16(n) - vte.s16(n) >= 0) ? vte.u16(n) : vs.u16(n);
+      }
+    }
+    VCOL = zero;
+    VCOH = zero;
+    VCE = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sign, dlez, dgez, nvt, mask;
+    sign = _mm_xor_si128(vs, vte);
+    sign = _mm_srai_epi16(sign, 15);
+    dlez = _mm_and_si128(vs, sign);
+    dlez = _mm_add_epi16(dlez, vte);
+    VCCL = _mm_srai_epi16(dlez, 15);
+    dgez = _mm_or_si128(vs, sign);
+    dgez = _mm_min_epi16(dgez, vte);
+    VCCH = _mm_cmpeq_epi16(dgez, vte);
+    nvt = _mm_xor_si128(vte, sign);
+    mask = _mm_blendv_epi8(VCCH, VCCL, sign);
+    ACCL = _mm_blendv_epi8(vs, nvt, mask);
+    vd = ACCL;
+    VCOL = zero;
+    VCOH = zero;
+    VCE = zero;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VEQ(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, !VCOH.get(n) && vs.u16(n) == vte.u16(n)) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;  //unverified
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    VCCL = _mm_andnot_si128(VCOH, eq);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;  //unverified
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
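+// Commentary (added, not original): VEQ/VNE/VGE/VLT are compare-and-select
+// ops: VCCL records the per-lane predicate, ACCL (and vd) receive whichever
+// operand the predicate picks, and the VCO flags left by a preceding
+// VSUBC/VADDC break ties on equality before being cleared.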
+
+template<u8 e>
+auto RSP::VGE(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, vs.s16(n) > vte.s16(n) || (vs.s16(n) == vte.s16(n) && (!VCOL.get(n) || !VCOH.get(n)))) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;  //unverified
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq, gt, es;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    gt = _mm_cmpgt_epi16(vs, vte);
+    es = _mm_and_si128(VCOH, VCOL);
+    eq = _mm_andnot_si128(es, eq);
+    VCCL = _mm_or_si128(gt, eq);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VLT(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, vs.s16(n) < vte.s16(n) || (vs.s16(n) == vte.s16(n) && VCOL.get(n) && VCOH.get(n))) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq, lt;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    lt = _mm_cmplt_epi16(vs, vte);
+    eq = _mm_and_si128(VCOH, eq);
+    eq = _mm_and_si128(VCOL, eq);
+    VCCL = _mm_or_si128(lt, eq);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<bool U, u8 e>
+auto RSP::VMACF(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + (s64)vs.s16(n) * (s64)vte.s16(n) * 2);
+      if constexpr (U == 0) {
+        vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+      }
+      if constexpr (U == 1) {
+        vd.u16(n) = ACCH.s16(n) < 0 ? 0x0000 : (ACCH.s16(n) || ACCM.s16(n) < 0) ? 0xffff : ACCM.u16(n);
+      }
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, md, hi, carry, omask;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epi16(vs, vte);
+    md = _mm_slli_epi16(hi, 1);
+    carry = _mm_srli_epi16(lo, 15);
+    hi = _mm_srai_epi16(hi, 15);
+    md = _mm_or_si128(md, carry);
+    lo = _mm_slli_epi16(lo, 1);
+    omask = _mm_adds_epu16(ACCL, lo);
+    ACCL = _mm_add_epi16(ACCL, lo);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    md = _mm_sub_epi16(md, omask);
+    carry = _mm_cmpeq_epi16(md, zero);
+    carry = _mm_and_si128(carry, omask);
+    hi = _mm_sub_epi16(hi, carry);
+    omask = _mm_adds_epu16(ACCM, md);
+    ACCM = _mm_add_epi16(ACCM, md);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    if constexpr (!U) {
+      lo = _mm_unpacklo_epi16(ACCM, ACCH);
+      hi = _mm_unpackhi_epi16(ACCM, ACCH);
+      vd = _mm_packs_epi32(lo, hi);
+    } else {
+      r128 mmask, hmask;
+      mmask = _mm_srai_epi16(ACCM, 15);
+      hmask = _mm_srai_epi16(ACCH, 15);
+      md = _mm_or_si128(mmask, ACCM);
+      omask = _mm_cmpgt_epi16(ACCH, zero);
+      md = _mm_andnot_si128(hmask, md);
+      vd = _mm_or_si128(omask, md);
+    }
+#endif
+  }
+}
+
+auto RSP::VMACQ(r128& vd) -> void {
+  for (u32 n = 0; n < 8; n++) {
+    s32 product = ACCH.element(n) << 16 | ACCM.element(n) << 0;
+    if (product < 0 && !(product & (1 << 5))) product += 32;
+    else if (product >= 32 && !(product & (1 << 5))) product -= 32;
+    ACCH.element(n) = product >> 16;
+    ACCM.element(n) = product >> 0;
+    vd.element(n) = sclamp<16>(product >> 1) & ~15;
+  }
+}
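+// Commentary (added, not original): the multiply-accumulate family keeps a
+// 48-bit accumulator per lane as three 16-bit slices ACCH:ACCM:ACCL, packed
+// and unpacked by accumulatorGet/accumulatorSet (defined elsewhere in this
+// file). A rough scalar sketch of one VMACF step:
+//   s64 acc = accumulatorGet(n);            // sign-extended 48-bit value
+//   acc += (s64)vs.s16(n) * vt.s16(n) * 2;  // fractional product, doubled
+//   accumulatorSet(n, acc);                 // split back into H:M:L slices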
+
+template<u8 e>
+auto RSP::VMADH(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      s32 result = (accumulatorGet(n) >> 16) + vs.s16(n) * vte.s16(n);
+      ACCH.u16(n) = result >> 16;
+      ACCM.u16(n) = result >> 0;
+      vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, omask;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epi16(vs, vte);
+    omask = _mm_adds_epu16(ACCM, lo);
+    ACCM = _mm_add_epi16(ACCM, lo);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(hi, omask);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    lo = _mm_unpacklo_epi16(ACCM, ACCH);
+    hi = _mm_unpackhi_epi16(ACCM, ACCH);
+    vd = _mm_packs_epi32(lo, hi);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMADL(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + (u32(vs.u16(n) * vte.u16(n)) >> 16));
+      vd.u16(n) = accumulatorSaturate(n, 0, 0x0000, 0xffff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), hi, omask, nhi, nmd, shi, smd, cmask, cval;
+    hi = _mm_mulhi_epu16(vs, vte);
+    omask = _mm_adds_epu16(ACCL, hi);
+    ACCL = _mm_add_epi16(ACCL, hi);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(zero, omask);
+    omask = _mm_adds_epu16(ACCM, hi);
+    ACCM = _mm_add_epi16(ACCM, hi);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    nhi = _mm_srai_epi16(ACCH, 15);
+    nmd = _mm_srai_epi16(ACCM, 15);
+    shi = _mm_cmpeq_epi16(nhi, ACCH);
+    smd = _mm_cmpeq_epi16(nhi, nmd);
+    cmask = _mm_and_si128(smd, shi);
+    cval = _mm_cmpeq_epi16(nhi, zero);
+    vd = _mm_blendv_epi8(cval, ACCL, cmask);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMADM(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + vs.s16(n) * vte.u16(n));
+      vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, sign, vta, omask;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vs, 15);
+    vta = _mm_and_si128(vte, sign);
+    hi = _mm_sub_epi16(hi, vta);
+    omask = _mm_adds_epu16(ACCL, lo);
+    ACCL = _mm_add_epi16(ACCL, lo);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(hi, omask);
+    omask = _mm_adds_epu16(ACCM, hi);
+    ACCM = _mm_add_epi16(ACCM, hi);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_srai_epi16(hi, 15);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    lo = _mm_unpacklo_epi16(ACCM, ACCH);
+    hi = _mm_unpackhi_epi16(ACCM, ACCH);
+    vd = _mm_packs_epi32(lo, hi);
+#endif
+  }
+}
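+// Commentary (added, not original): the repeated omask pattern in these SIMD
+// paths is a manual carry ripple. After adding a partial product into ACCL
+// (or ACCM), comparing the wrapped sum with the saturating sum leaves
+// all-ones in each lane that carried; subtracting that mask (-1 per lane)
+// then propagates the carry into the next accumulator slice.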
+
+template<u8 e>
+auto RSP::VMADN(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, accumulatorGet(n) + s64(vs.u16(n) * vte.s16(n)));
+      vd.u16(n) = accumulatorSaturate(n, 0, 0x0000, 0xffff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, sign, vsa, omask, nhi, nmd, shi, smd, cmask, cval;
+    lo = _mm_mullo_epi16(vs, vte);
+    hi = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vte, 15);
+    vsa = _mm_and_si128(vs, sign);
+    hi = _mm_sub_epi16(hi, vsa);
+    omask = _mm_adds_epu16(ACCL, lo);
+    ACCL = _mm_add_epi16(ACCL, lo);
+    omask = _mm_cmpeq_epi16(ACCL, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_sub_epi16(hi, omask);
+    omask = _mm_adds_epu16(ACCM, hi);
+    ACCM = _mm_add_epi16(ACCM, hi);
+    omask = _mm_cmpeq_epi16(ACCM, omask);
+    omask = _mm_cmpeq_epi16(omask, zero);
+    hi = _mm_srai_epi16(hi, 15);
+    ACCH = _mm_add_epi16(ACCH, hi);
+    ACCH = _mm_sub_epi16(ACCH, omask);
+    nhi = _mm_srai_epi16(ACCH, 15);
+    nmd = _mm_srai_epi16(ACCM, 15);
+    shi = _mm_cmpeq_epi16(nhi, ACCH);
+    smd = _mm_cmpeq_epi16(nhi, nmd);
+    cmask = _mm_and_si128(smd, shi);
+    cval = _mm_cmpeq_epi16(nhi, zero);
+    vd = _mm_blendv_epi8(cval, ACCL, cmask);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMOV(r128& vd, u8 de, cr128& vt) -> void {
+  cr128 vte = vt(e);
+  vd.u16(de) = vte.u16(de);
+  ACCL = vte;
+}
+
+template<u8 e>
+auto RSP::VMRG(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.get(n) ? vs.u16(n) : vte.u16(n);
+    }
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_blendv_epi8(vt(e), vs, VCCL);
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDH(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, s64(vs.s16(n) * vte.s16(n)) << 16);
+      vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi;
+    ACCL = zero;
+    ACCM = _mm_mullo_epi16(vs, vte);
+    ACCH = _mm_mulhi_epi16(vs, vte);
+    lo = _mm_unpacklo_epi16(ACCM, ACCH);
+    hi = _mm_unpackhi_epi16(ACCM, ACCH);
+    vd = _mm_packs_epi32(lo, hi);
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDL(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, u16((vs.u16(n) * vte.u16(n)) >> 16));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_mulhi_epu16(vs, vt(e));
+    ACCM = zero;
+    ACCH = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDM(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, s32(vs.s16(n) * vte.u16(n)));
+    }
+    vd = ACCM;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sign, vta;
+    ACCL = _mm_mullo_epi16(vs, vte);
+    ACCM = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vs, 15);
+    vta = _mm_and_si128(vte, sign);
+    ACCM = _mm_sub_epi16(ACCM, vta);
+    ACCH = _mm_srai_epi16(ACCM, 15);
+    vd = ACCM;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMUDN(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, s32(vs.u16(n) * vte.s16(n)));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), sign, vsa;
+    ACCL = _mm_mullo_epi16(vs, vte);
+    ACCM = _mm_mulhi_epu16(vs, vte);
+    sign = _mm_srai_epi16(vte, 15);
+    vsa = _mm_and_si128(vs, sign);
+    ACCM = _mm_sub_epi16(ACCM, vsa);
+    ACCH = _mm_srai_epi16(ACCM, 15);
+    vd = ACCL;
+#endif
+  }
+}
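+// Commentary (added, not original): the VMUD*/VMAD* suffix encodes operand
+// signedness and which accumulator slice feeds vd:
+//   L: unsigned * unsigned, high half -> vd = ACCL
+//   M: signed * unsigned              -> vd = ACCM
+//   N: unsigned * signed              -> vd = ACCL
+//   H: signed * signed, shifted << 16 -> vd = clamp(ACCH:ACCM)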
+
+template<bool U, u8 e>
+auto RSP::VMULF(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      accumulatorSet(n, (s64)vs.s16(n) * (s64)vte.s16(n) * 2 + 0x8000);
+      if constexpr (U == 0) {
+        vd.u16(n) = accumulatorSaturate(n, 1, 0x8000, 0x7fff);
+      }
+      if constexpr (U == 1) {
+        vd.u16(n) = ACCH.s16(n) < 0 ? 0x0000 : (ACCH.s16(n) ^ ACCM.s16(n)) < 0 ? 0xffff : ACCM.u16(n);
+      }
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), lo, hi, round, sign1, sign2, neq, eq, neg;
+    lo = _mm_mullo_epi16(vs, vte);
+    round = _mm_cmpeq_epi16(zero, zero);
+    sign1 = _mm_srli_epi16(lo, 15);
+    lo = _mm_add_epi16(lo, lo);
+    round = _mm_slli_epi16(round, 15);
+    hi = _mm_mulhi_epi16(vs, vte);
+    sign2 = _mm_srli_epi16(lo, 15);
+    ACCL = _mm_add_epi16(round, lo);
+    sign1 = _mm_add_epi16(sign1, sign2);
+    hi = _mm_slli_epi16(hi, 1);
+    neq = _mm_cmpeq_epi16(vs, vte);
+    ACCM = _mm_add_epi16(hi, sign1);
+    neg = _mm_srai_epi16(ACCM, 15);
+    if constexpr (!U) {
+      eq = _mm_and_si128(neq, neg);
+      ACCH = _mm_andnot_si128(neq, neg);
+      vd = _mm_add_epi16(ACCM, eq);
+    } else {
+      ACCH = _mm_andnot_si128(neq, neg);
+      hi = _mm_or_si128(ACCM, neg);
+      vd = _mm_andnot_si128(ACCH, hi);
+    }
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VMULQ(r128& vd, cr128& vs, cr128& vt) -> void {
+  cr128 vte = vt(e);
+  for (u32 n = 0; n < 8; n++) {
+    s32 product = (s16)vs.element(n) * (s16)vte.element(n);
+    if (product < 0) product += 31;  //round
+    ACCH.element(n) = product >> 16;
+    ACCM.element(n) = product >> 0;
+    ACCL.element(n) = 0;
+    vd.element(n) = sclamp<16>(product >> 1) & ~15;
+  }
+}
+
+template<u8 e>
+auto RSP::VNAND(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = ~(vs.u16(n) & vte.u16(n));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_and_si128(vs, vt(e));
+    ACCL = _mm_xor_si128(ACCL, invert);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VNE(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = VCCL.set(n, vs.u16(n) != vte.u16(n) || VCOH.get(n)) ? vs.u16(n) : vte.u16(n);
+    }
+    VCCH = zero;  //unverified
+    VCOL = zero;
+    VCOH = zero;
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), eq, ne;
+    eq = _mm_cmpeq_epi16(vs, vte);
+    ne = _mm_cmpeq_epi16(eq, zero);
+    VCCL = _mm_and_si128(VCOH, eq);
+    VCCL = _mm_or_si128(VCCL, ne);
+    ACCL = _mm_blendv_epi8(vte, vs, VCCL);
+    VCCH = zero;
+    VCOH = zero;
+    VCOL = zero;
+    vd = ACCL;
+#endif
+  }
+}
+
+auto RSP::VNOP() -> void {
+}
+
+template<u8 e>
+auto RSP::VNOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = ~(vs.u16(n) | vte.u16(n));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_or_si128(vs, vt(e));
+    ACCL = _mm_xor_si128(ACCL, invert);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VNXOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = ~(vs.u16(n) ^ vte.u16(n));
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_xor_si128(vs, vt(e));
+    ACCL = _mm_xor_si128(ACCL, invert);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = vs.u16(n) | vte.u16(n);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_or_si128(vs, vt(e));
+    vd = ACCL;
+#endif
+  }
+}
+
+template<bool L, u8 e>
+auto RSP::VRCP(r128& vd, u8 de, cr128& vt) -> void {
+  s32 result = 0;
+  s32 input = L && DIVDP ? DIVIN << 16 | vt.element(e & 7) : s16(vt.element(e & 7));
+  s32 mask = input >> 31;
+  s32 data = input ^ mask;
+  if (input > -32768) data -= mask;
+  if (data == 0) {
+    result = 0x7fff'ffff;
+  } else if (input == -32768) {
+    result = 0xffff'0000;
+  } else {
+    u32 shift = __builtin_clz(data);
+    u32 index = ((u64(data) << shift) & 0x7fc0'0000) >> 22;
+    result = rspReciprocals[index];
+    result = (0x10000 | result) << 14;
+    result = (result >> (31 - shift)) ^ mask;
+  }
+  DIVDP = 0;
+  DIVOUT = result >> 16;
+  ACCL = vt(e);
+  vd.element(de) = result;
+}
+
+template<u8 e>
+auto RSP::VRCPH(r128& vd, u8 de, cr128& vt) -> void {
+  ACCL = vt(e);
+  DIVDP = 1;
+  DIVIN = vt.element(e & 7);
+  vd.element(de) = DIVOUT;
+}
+
+template<bool D, u8 e>
+auto RSP::VRND(r128& vd, u8 vs, cr128& vt) -> void {
+  cr128 vte = vt(e);
+  for (u32 n = 0; n < 8; n++) {
+    s32 product = (s16)vte.element(n);
+    if (vs & 1) product <<= 16;
+    s64 acc = 0;
+    acc |= ACCH.element(n); acc <<= 16;
+    acc |= ACCM.element(n); acc <<= 16;
+    acc |= ACCL.element(n); acc <<= 16;
+    acc >>= 16;
+    if (D == 0 && acc < 0) acc = sclip<48>(acc + product);
+    if (D == 1 && acc >= 0) acc = sclip<48>(acc + product);
+    ACCH.element(n) = acc >> 32;
+    ACCM.element(n) = acc >> 16;
+    ACCL.element(n) = acc >> 0;
+    vd.element(n) = sclamp<16>(acc >> 16);
+  }
+}
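+// Commentary (added, not original): VRCP/VRSQ normalize |input| so the
+// leading one bit reaches a fixed position, index a 512-entry reciprocal
+// table with the next 9 bits, then shift the mantissa back into place and
+// restore the sign with the XOR mask. VRSQ folds the exponent's parity into
+// the table index and halves the un-shift, because sqrt halves the exponent.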
+
+template<bool L, u8 e>
+auto RSP::VRSQ(r128& vd, u8 de, cr128& vt) -> void {
+  s32 result = 0;
+  s32 input = L && DIVDP ? DIVIN << 16 | vt.element(e & 7) : s16(vt.element(e & 7));
+  s32 mask = input >> 31;
+  s32 data = input ^ mask;
+  if (input > -32768) data -= mask;
+  if (data == 0) {
+    result = 0x7fff'ffff;
+  } else if (input == -32768) {
+    result = 0xffff'0000;
+  } else {
+    u32 shift = __builtin_clz(data);
+    u32 index = ((u64(data) << shift) & 0x7fc0'0000) >> 22;
+    result = rspInverseSquareRoots[(index & 0x1fe) | (shift & 1)];
+    result = (0x10000 | result) << 14;
+    result = (result >> ((31 - shift) >> 1)) ^ mask;
+  }
+  DIVDP = 0;
+  DIVOUT = result >> 16;
+  ACCL = vt(e);
+  vd.element(de) = result;
+}
+
+template<u8 e>
+auto RSP::VRSQH(r128& vd, u8 de, cr128& vt) -> void {
+  ACCL = vt(e);
+  DIVDP = 1;
+  DIVIN = vt.element(e & 7);
+  vd.element(de) = DIVOUT;
+}
+
+template<u8 e>
+auto RSP::VSAR(r128& vd, cr128& vs) -> void {
+  switch (e) {
+    case 0x8: vd = ACCH; break;
+    case 0x9: vd = ACCM; break;
+    case 0xa: vd = ACCL; break;
+    default: vd = zero; break;
+  }
+}
+
+template<u8 e>
+auto RSP::VSUB(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      s32 result = vs.s16(n) - vte.s16(n) - VCOL.get(n);
+      ACCL.s16(n) = result;
+      vd.s16(n) = sclamp<16>(result);
+    }
+    VCOL = zero;
+    VCOH = zero;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), udiff, sdiff, ov;
+    udiff = _mm_sub_epi16(vte, VCOL);
+    sdiff = _mm_subs_epi16(vte, VCOL);
+    ACCL = _mm_sub_epi16(vs, udiff);
+    ov = _mm_cmpgt_epi16(sdiff, udiff);
+    vd = _mm_subs_epi16(vs, sdiff);
+    vd = _mm_adds_epi16(vd, ov);
+    VCOL = zero;
+    VCOH = zero;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VSUBC(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      u32 result = vs.u16(n) - vte.u16(n);
+      ACCL.u16(n) = result;
+      VCOL.set(n, result >> 16);
+      VCOH.set(n, result != 0);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    r128 vte = vt(e), equal, udiff, diff0;
+    udiff = _mm_subs_epu16(vs, vte);
+    equal = _mm_cmpeq_epi16(vs, vte);
+    diff0 = _mm_cmpeq_epi16(udiff, zero);
+    VCOH = _mm_cmpeq_epi16(equal, zero);
+    VCOL = _mm_andnot_si128(equal, diff0);
+    ACCL = _mm_sub_epi16(vs, vte);
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VXOR(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      ACCL.u16(n) = vs.u16(n) ^ vte.u16(n);
+    }
+    vd = ACCL;
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_xor_si128(vs, vt(e));
+    vd = ACCL;
+#endif
+  }
+}
+
+template<u8 e>
+auto RSP::VZERO(r128& vd, cr128& vs, cr128& vt) -> void {
+  if constexpr (Accuracy::RSP::SISD) {
+    cr128 vte = vt(e);
+    for (u32 n = 0; n < 8; n++) {
+      s32 result = vs.s16(n) + vte.s16(n);
+      ACCL.s16(n) = result;
+      vd.s16(n) = 0;
+    }
+  }
+
+  if constexpr (Accuracy::RSP::SIMD) {
+#if ARCHITECTURE_SUPPORTS_SSE4_1
+    ACCL = _mm_add_epi16(vs, vt(e));
+    vd = _mm_xor_si128(vd, vd);
+#endif
+  }
+}
+
+#undef ACCH
+#undef ACCM
+#undef ACCL
+#undef VCOH
+#undef VCOL
+#undef VCCH
+#undef VCCL
+#undef VCE
+
+#undef DIVIN
+#undef DIVOUT
+#undef DIVDP
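+
+// Commentary (added, not original): callers are expected to instantiate
+// these templates with fields decoded from the opcode; a hypothetical
+// decoder sketch (names assumed) would expand `vadd vd, vs, vt[e]` into a
+// switch over the 4-bit element field calling VADD<0>(...) ... VADD<15>(...).
+// The #undef block above closes the scope of the accumulator, flag, and
+// divider register aliases defined earlier in the file.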