From bbc7844021dc34e26285a495ed86bad088b87279 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 15:39:56 -0800
Subject: [PATCH 1/9] VideoCore: Change misleading register names

A few registers had names such as "count" or "number" when they actually
contained the maximum index (that is, count - 1). This can easily lead to
hard-to-notice off-by-one errors.
---
 src/video_core/command_processor.cpp             | 5 +++--
 src/video_core/pica.h                            | 8 ++++----
 src/video_core/renderer_opengl/gl_rasterizer.cpp | 2 +-
 src/video_core/renderer_opengl/gl_rasterizer.h   | 2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index eb79974a8..9c0ed79c7 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -138,7 +138,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 immediate_input.attr[immediate_attribute_id++] = attribute;
 
-                if (immediate_attribute_id >= regs.vs.num_input_attributes + 1) {
+                if (immediate_attribute_id >= regs.vs.max_input_attribute_index + 1) {
                     MICROPROFILE_SCOPE(GPU_Drawing);
                     immediate_attribute_id = 0;
 
@@ -150,7 +150,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
                                                  static_cast<void*>(&immediate_input));
                     Shader::UnitState shader_unit;
-                    shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1);
+                    shader_unit.LoadInputVertex(immediate_input,
+                                                regs.vs.max_input_attribute_index + 1);
                     shader_engine->Run(g_state.vs, shader_unit);
                     auto output_vertex = Shader::OutputVertex::FromRegisters(
                         shader_unit.registers.output, regs, regs.vs.output_mask);
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index b2db609ec..5afc9d5dd 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -868,7 +868,7 @@ struct Regs {
         LightSrc light[8];
         LightColor global_ambient; // Emission + (material.ambient * lighting.ambient)
         INSERT_PADDING_WORDS(0x1);
-        BitField<0, 3, u32> num_lights; // Number of enabled lights - 1
+        BitField<0, 3, u32> max_light_index; // Number of enabled lights - 1
 
         union {
             BitField<2, 2, LightingFresnelSelector> fresnel_selector;
@@ -1045,7 +1045,7 @@ struct Regs {
             BitField<48, 12, u64> attribute_mask;
 
             // number of total attributes minus 1
-            BitField<60, 4, u64> num_extra_attributes;
+            BitField<60, 4, u64> max_attribute_index;
         };
 
         inline VertexAttributeFormat GetFormat(int n) const {
@@ -1076,7 +1076,7 @@ struct Regs {
         }
 
         inline int GetNumTotalAttributes() const {
-            return (int)num_extra_attributes + 1;
+            return (int)max_attribute_index + 1;
         }
 
         // Attribute loaders map the source vertex data to input attributes
@@ -1214,7 +1214,7 @@ struct Regs {
 
         union {
             // Number of input attributes to shader unit - 1
-            BitField<0, 4, u32> num_input_attributes;
+            BitField<0, 4, u32> max_input_attribute_index;
         };
 
         // Offset to shader program entry point (in words)
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 2d2f4edc1..9dd9ae0fb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -467,7 +467,7 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
 
     // Fragment lighting switches
     case PICA_REG_INDEX(lighting.disable):
-    case PICA_REG_INDEX(lighting.num_lights):
+    case PICA_REG_INDEX(lighting.max_light_index):
     case PICA_REG_INDEX(lighting.config0):
     case PICA_REG_INDEX(lighting.config1):
     case PICA_REG_INDEX(lighting.abs_lut_input):
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index cc3e4bed5..a1aa07074 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -84,7 +84,7 @@ union PicaShaderConfig {
         // Fragment lighting
 
         state.lighting.enable = !regs.lighting.disable;
-        state.lighting.src_num = regs.lighting.num_lights + 1;
+        state.lighting.src_num = regs.lighting.max_light_index + 1;
 
         for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) {
             unsigned num = regs.lighting.light_enable.GetNum(light_index);

From ab6954e942654fb003964fc95c0846aa8b89ac91 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 16:42:19 -0800
Subject: [PATCH 2/9] VideoCore: Rename some types to more accurate names

---
 src/citra_qt/debugger/graphics/graphics_tracing.cpp   |  4 ++--
 .../debugger/graphics/graphics_vertex_shader.h        |  2 +-
 src/video_core/command_processor.cpp                  | 11 +++++------
 src/video_core/pica_state.h                           |  4 ++--
 src/video_core/shader/shader.cpp                      |  2 +-
 src/video_core/shader/shader.h                        |  4 ++--
 src/video_core/shader/shader_interpreter.cpp          |  4 ++--
 src/video_core/shader/shader_interpreter.h            |  2 +-
 src/video_core/vertex_loader.cpp                      |  5 +++--
 src/video_core/vertex_loader.h                        |  4 ++--
 10 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/citra_qt/debugger/graphics/graphics_tracing.cpp b/src/citra_qt/debugger/graphics/graphics_tracing.cpp
index 716ed50b8..17f1c5ce2 100644
--- a/src/citra_qt/debugger/graphics/graphics_tracing.cpp
+++ b/src/citra_qt/debugger/graphics/graphics_tracing.cpp
@@ -71,8 +71,8 @@ void GraphicsTracingWidget::StartRecording() {
     std::array<u32, 4 * 16> default_attributes;
     for (unsigned i = 0; i < 16; ++i) {
         for (unsigned comp = 0; comp < 3; ++comp) {
-            default_attributes[4 * i + comp] =
-                nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32());
+            default_attributes[4 * i + comp] = nihstro::to_float24(
+                Pica::g_state.input_default_attributes.attr[i][comp].ToFloat32());
         }
     }
 
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
index 3292573f3..c249a2ff8 100644
--- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
+++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
@@ -82,7 +82,7 @@ private:
 
     nihstro::ShaderInfo info;
     Pica::Shader::DebugData<true> debug_data;
-    Pica::Shader::InputVertex input_vertex;
+    Pica::Shader::AttributeBuffer input_vertex;
 
     friend class GraphicsVertexShaderModel;
 };
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 9c0ed79c7..45b994b46 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -125,7 +125,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             // TODO: Verify that this actually modifies the register!
             if (setup.index < 15) {
-                g_state.vs_default_attributes[setup.index] = attribute;
+                g_state.input_default_attributes.attr[setup.index] = attribute;
                 setup.index++;
             } else {
                 // Put each attribute into an immediate input buffer.
@@ -138,7 +138,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 immediate_input.attr[immediate_attribute_id++] = attribute;
 
-                if (immediate_attribute_id >= regs.vs.max_input_attribute_index + 1) {
+                if (immediate_attribute_id > regs.vs.max_input_attribute_index) {
                     MICROPROFILE_SCOPE(GPU_Drawing);
                     immediate_attribute_id = 0;
 
@@ -150,8 +150,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
                                                  static_cast<void*>(&immediate_input));
                     Shader::UnitState shader_unit;
-                    shader_unit.LoadInputVertex(immediate_input,
-                                                regs.vs.max_input_attribute_index + 1);
+                    shader_unit.LoadInput(immediate_input, regs.vs.max_input_attribute_index + 1);
                     shader_engine->Run(g_state.vs, shader_unit);
                     auto output_vertex = Shader::OutputVertex::FromRegisters(
                         shader_unit.registers.output, regs, regs.vs.output_mask);
@@ -281,14 +280,14 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             if (!vertex_cache_hit) {
                 // Initialize data for the current vertex
-                Shader::InputVertex input;
+                Shader::AttributeBuffer input;
                 loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
 
                 // Send to vertex shader
                 if (g_debug_context)
                     g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
                                              (void*)&input);
-                shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes());
+                shader_unit.LoadInput(input, loader.GetNumTotalAttributes());
                 shader_engine->Run(g_state.vs, shader_unit);
 
                 // Retrieve vertex from register data
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index e4f2e6d5d..785d05650 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -23,7 +23,7 @@ struct State {
     Shader::ShaderSetup vs;
     Shader::ShaderSetup gs;
 
-    std::array<Math::Vec4<float24>, 16> vs_default_attributes;
+    Shader::AttributeBuffer input_default_attributes;
 
     struct {
         union LutEntry {
@@ -66,7 +66,7 @@ struct State {
     /// Struct used to describe immediate mode rendering state
     struct ImmediateModeState {
         // Used to buffer partial vertices for immediate-mode rendering.
-        Shader::InputVertex input_vertex;
+        Shader::AttributeBuffer input_vertex;
         // Index of the next attribute to be loaded into `input_vertex`.
         u32 current_attribute = 0;
     } immediate;
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 2da50bd62..971ce5b7a 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -71,7 +71,7 @@ OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], co
     return ret;
 }
 
-void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) {
+void UnitState::LoadInput(const AttributeBuffer& input, int num_attributes) {
     // Setup input register table
     const auto& attribute_register_map = g_state.regs.vs.input_register_map;
 
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 44d9f76c3..cb38ec0a6 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -23,7 +23,7 @@ namespace Pica {
 
 namespace Shader {
 
-struct InputVertex {
+struct AttributeBuffer {
     alignas(16) Math::Vec4<float24> attr[16];
 };
 
@@ -140,7 +140,7 @@ struct UnitState {
      * @param input Input vertex into the shader
      * @param num_attributes The number of vertex shader attributes to load
      */
-    void LoadInputVertex(const InputVertex& input, int num_attributes);
+    void LoadInput(const AttributeBuffer& input, int num_attributes);
 };
 
 struct ShaderSetup {
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index c0c89b857..d803aebbf 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -668,14 +668,14 @@ void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const {
 }
 
 DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
-                                                    const InputVertex& input,
+                                                    const AttributeBuffer& input,
                                                     int num_attributes) const {
     UnitState state;
     DebugData<true> debug_data;
 
     // Setup input register table
     boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
-    state.LoadInputVertex(input, num_attributes);
+    state.LoadInput(input, num_attributes);
     RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
     return debug_data;
 }
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index d6c0e2d8c..593e02157 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -23,7 +23,7 @@ public:
      * @param config Configuration object for the shader pipeline
      * @return Debug information for this shader with regards to the given vertex
      */
-    DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input,
+    DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input,
                                      int num_attributes) const;
 };
 
diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
index 2b8ef7018..bf83b61ca 100644
--- a/src/video_core/vertex_loader.cpp
+++ b/src/video_core/vertex_loader.cpp
@@ -70,7 +70,8 @@ void VertexLoader::Setup(const Pica::Regs& regs) {
     is_setup = true;
 }
 
-void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input,
+void VertexLoader::LoadVertex(u32 base_address, int index, int vertex,
+                              Shader::AttributeBuffer& input,
                               DebugUtils::MemoryAccessTracker& memory_accesses) {
     ASSERT_MSG(is_setup, "A VertexLoader needs to be setup before loading vertices.");
 
@@ -142,7 +143,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I
                       input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
         } else if (vertex_attribute_is_default[i]) {
             // Load the default attribute if we're configured to do so
-            input.attr[i] = g_state.vs_default_attributes[i];
+            input.attr[i] = g_state.input_default_attributes.attr[i];
             LOG_TRACE(HW_GPU,
                       "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", i,
                       vertex, index, input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h
index 9f2098bb2..51f3d45b4 100644
--- a/src/video_core/vertex_loader.h
+++ b/src/video_core/vertex_loader.h
@@ -11,7 +11,7 @@ class MemoryAccessTracker;
 }
 
 namespace Shader {
-struct InputVertex;
+struct AttributeBuffer;
 }
 
 class VertexLoader {
@@ -22,7 +22,7 @@ public:
     }
 
     void Setup(const Pica::Regs& regs);
-    void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input,
+    void LoadVertex(u32 base_address, int index, int vertex, Shader::AttributeBuffer& input,
                     DebugUtils::MemoryAccessTracker& memory_accesses);
 
     int GetNumTotalAttributes() const {

From fccb28d2e9f2f813230912e5cf1fea7f352797c7 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 16:50:04 -0800
Subject: [PATCH 3/9] VideoCore: Use correct register for immediate mode
 attribute count

---
 src/video_core/command_processor.cpp | 13 +++++++------
 src/video_core/pica.h                |  7 ++++++-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 45b994b46..27b7a023f 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -128,17 +128,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                 g_state.input_default_attributes.attr[setup.index] = attribute;
                 setup.index++;
             } else {
-                // Put each attribute into an immediate input buffer.
-                // When all specified immediate attributes are present, the Vertex Shader is invoked
-                // and everything is
-                // sent to the primitive assembler.
+                // Put each attribute into an immediate input buffer.  When all specified immediate
+                // attributes are present, the Vertex Shader is invoked and everything is sent to
+                // the primitive assembler.
 
                 auto& immediate_input = g_state.immediate.input_vertex;
                 auto& immediate_attribute_id = g_state.immediate.current_attribute;
 
-                immediate_input.attr[immediate_attribute_id++] = attribute;
+                immediate_input.attr[immediate_attribute_id] = attribute;
 
-                if (immediate_attribute_id > regs.vs.max_input_attribute_index) {
+                if (immediate_attribute_id < regs.max_input_attrib_index) {
+                    immediate_attribute_id += 1;
+                } else {
                     MICROPROFILE_SCOPE(GPU_Drawing);
                     immediate_attribute_id = 0;
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 5afc9d5dd..c772896e0 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1176,7 +1176,12 @@ struct Regs {
         }
     } command_buffer;
 
-    INSERT_PADDING_WORDS(0x07);
+    INSERT_PADDING_WORDS(4);
+
+    /// Number of input attributes to the vertex shader minus 1
+    BitField<0, 4, u32> max_input_attrib_index;
+
+    INSERT_PADDING_WORDS(2);
 
     enum class GPUMode : u32 {
         Drawing = 0,

From 335df895b9f9e9760ed5cd0d6dfaea8befb94dac Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 17:25:03 -0800
Subject: [PATCH 4/9] VideoCore: Consistently use shader configuration to load
 attributes

---
 .../graphics/graphics_vertex_shader.cpp       |  6 +--
 src/video_core/command_processor.cpp          |  4 +-
 src/video_core/pica.h                         | 37 ++++---------------
 src/video_core/shader/shader.cpp              | 11 +++---
 src/video_core/shader/shader.h                |  6 +--
 src/video_core/shader/shader_interpreter.cpp  |  4 +-
 src/video_core/shader/shader_interpreter.h    |  3 +-
 7 files changed, 25 insertions(+), 46 deletions(-)

diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
index f37524190..489ec5f21 100644
--- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
@@ -511,7 +511,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
     auto& shader_config = Pica::g_state.regs.vs;
     for (auto instr : shader_setup.program_code)
         info.code.push_back({instr});
-    int num_attributes = Pica::g_state.regs.vertex_attributes.GetNumTotalAttributes();
+    int num_attributes = shader_config.max_input_attribute_index + 1;
 
     for (auto pattern : shader_setup.swizzle_data)
         info.swizzle_info.push_back({pattern});
@@ -522,11 +522,11 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
     // Generate debug information
     Pica::Shader::InterpreterEngine shader_engine;
     shader_engine.SetupBatch(shader_setup, entry_point);
-    debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, num_attributes);
+    debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, shader_config);
 
     // Reload widget state
     for (int attr = 0; attr < num_attributes; ++attr) {
-        unsigned source_attr = shader_config.input_register_map.GetRegisterForAttribute(attr);
+        unsigned source_attr = shader_config.GetRegisterForAttribute(attr);
         input_data_mapping[attr]->setText(QString("-> v%1").arg(source_attr));
         input_data_container[attr]->setVisible(true);
     }
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 27b7a023f..fef0b4ceb 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -151,7 +151,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
                                                  static_cast<void*>(&immediate_input));
                     Shader::UnitState shader_unit;
-                    shader_unit.LoadInput(immediate_input, regs.vs.max_input_attribute_index + 1);
+                    shader_unit.LoadInput(regs.vs, immediate_input);
                     shader_engine->Run(g_state.vs, shader_unit);
                     auto output_vertex = Shader::OutputVertex::FromRegisters(
                         shader_unit.registers.output, regs, regs.vs.output_mask);
@@ -288,7 +288,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                 if (g_debug_context)
                     g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
                                              (void*)&input);
-                shader_unit.LoadInput(input, loader.GetNumTotalAttributes());
+                shader_unit.LoadInput(regs.vs, input);
                 shader_engine->Run(g_state.vs, shader_unit);
 
                 // Retrieve vertex from register data
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index c772896e0..ac81a3d0f 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1225,36 +1225,15 @@ struct Regs {
         // Offset to shader program entry point (in words)
         BitField<0, 16, u32> main_offset;
 
-        union {
-            BitField<0, 4, u64> attribute0_register;
-            BitField<4, 4, u64> attribute1_register;
-            BitField<8, 4, u64> attribute2_register;
-            BitField<12, 4, u64> attribute3_register;
-            BitField<16, 4, u64> attribute4_register;
-            BitField<20, 4, u64> attribute5_register;
-            BitField<24, 4, u64> attribute6_register;
-            BitField<28, 4, u64> attribute7_register;
-            BitField<32, 4, u64> attribute8_register;
-            BitField<36, 4, u64> attribute9_register;
-            BitField<40, 4, u64> attribute10_register;
-            BitField<44, 4, u64> attribute11_register;
-            BitField<48, 4, u64> attribute12_register;
-            BitField<52, 4, u64> attribute13_register;
-            BitField<56, 4, u64> attribute14_register;
-            BitField<60, 4, u64> attribute15_register;
+        /// Maps input attributes to registers. 4-bits per attribute, specifying a register index
+        u32 input_attribute_to_register_map_low;
+        u32 input_attribute_to_register_map_high;
 
-            int GetRegisterForAttribute(int attribute_index) const {
-                u64 fields[] = {
-                    attribute0_register,  attribute1_register,  attribute2_register,
-                    attribute3_register,  attribute4_register,  attribute5_register,
-                    attribute6_register,  attribute7_register,  attribute8_register,
-                    attribute9_register,  attribute10_register, attribute11_register,
-                    attribute12_register, attribute13_register, attribute14_register,
-                    attribute15_register,
-                };
-                return (int)fields[attribute_index];
-            }
-        } input_register_map;
+        unsigned int GetRegisterForAttribute(unsigned int attribute_index) const {
+            u64 map = ((u64)input_attribute_to_register_map_high << 32) |
+                      (u64)input_attribute_to_register_map_low;
+            return (map >> (attribute_index * 4)) & 0b1111;
+        }
 
         BitField<0, 16, u32> output_mask;
 
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 971ce5b7a..dbad167e9 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -71,12 +71,13 @@ OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], co
     return ret;
 }
 
-void UnitState::LoadInput(const AttributeBuffer& input, int num_attributes) {
-    // Setup input register table
-    const auto& attribute_register_map = g_state.regs.vs.input_register_map;
+void UnitState::LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input) {
+    const unsigned max_attribute = config.max_input_attribute_index;
 
-    for (int i = 0; i < num_attributes; i++)
-        registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
+    for (unsigned attr = 0; attr <= max_attribute; ++attr) {
+        unsigned reg = config.GetRegisterForAttribute(attr);
+        registers.input[reg] = input.attr[attr];
+    }
 }
 
 MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index cb38ec0a6..43a8b848c 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -137,10 +137,10 @@ struct UnitState {
     /**
      * Loads the unit state with an input vertex.
      *
-     * @param input Input vertex into the shader
-     * @param num_attributes The number of vertex shader attributes to load
+     * @param config Shader configuration registers corresponding to the unit.
+     * @param input Attribute buffer to load into the input registers.
      */
-    void LoadInput(const AttributeBuffer& input, int num_attributes);
+    void LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input);
 };
 
 struct ShaderSetup {
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index d803aebbf..81522b8f5 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -669,13 +669,13 @@ void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const {
 
 DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
                                                     const AttributeBuffer& input,
-                                                    int num_attributes) const {
+                                                    const Regs::ShaderConfig& config) const {
     UnitState state;
     DebugData<true> debug_data;
 
     // Setup input register table
     boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
-    state.LoadInput(input, num_attributes);
+    state.LoadInput(config, input);
     RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
     return debug_data;
 }
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index 593e02157..d7a61e122 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -19,12 +19,11 @@ public:
     /**
      * Produce debug information based on the given shader and input vertex
      * @param input Input vertex into the shader
-     * @param num_attributes The number of vertex shader attributes
      * @param config Configuration object for the shader pipeline
      * @return Debug information for this shader with regards to the given vertex
      */
     DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input,
-                                     int num_attributes) const;
+                                     const Regs::ShaderConfig& config) const;
 };
 
 } // namespace

From 92bf5c88e6f85ebeef161a0056c86c66bc25c6e7 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 17:58:30 -0800
Subject: [PATCH 5/9] VideoCore: Split shader output writing from semantic
 loading

---
 src/video_core/command_processor.cpp | 14 ++++++++------
 src/video_core/shader/shader.cpp     | 29 +++++++++++++---------------
 src/video_core/shader/shader.h       |  5 +++--
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index fef0b4ceb..4955ff9f9 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -151,10 +151,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
                                                  static_cast<void*>(&immediate_input));
                     Shader::UnitState shader_unit;
+                    Shader::AttributeBuffer output{};
+
                     shader_unit.LoadInput(regs.vs, immediate_input);
                     shader_engine->Run(g_state.vs, shader_unit);
-                    auto output_vertex = Shader::OutputVertex::FromRegisters(
-                        shader_unit.registers.output, regs, regs.vs.output_mask);
+                    shader_unit.WriteOutput(regs.vs, output);
 
                     // Send to renderer
                     using Pica::Shader::OutputVertex;
@@ -163,7 +164,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
                     };
 
-                    g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
+                    g_state.primitive_assembler.SubmitVertex(
+                        Shader::OutputVertex::FromAttributeBuffer(regs, output), AddTriangle);
                 }
             }
         }
@@ -281,7 +283,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             if (!vertex_cache_hit) {
                 // Initialize data for the current vertex
-                Shader::AttributeBuffer input;
+                Shader::AttributeBuffer input, output{};
                 loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
 
                 // Send to vertex shader
@@ -290,10 +292,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                                              (void*)&input);
                 shader_unit.LoadInput(regs.vs, input);
                 shader_engine->Run(g_state.vs, shader_unit);
+                shader_unit.WriteOutput(regs.vs, output);
 
                 // Retrieve vertex from register data
-                output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output,
-                                                                    regs, regs.vs.output_mask);
+                output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs, output);
 
                 if (is_indexed) {
                     vertex_cache[vertex_cache_pos] = output_vertex;
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index dbad167e9..99a22c2dd 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -4,6 +4,7 @@
 
 #include <cmath>
 #include <cstring>
+#include "common/bit_set.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "video_core/pica.h"
@@ -19,22 +20,13 @@ namespace Pica {
 
 namespace Shader {
 
-OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs,
-                                         u32 output_mask) {
+OutputVertex OutputVertex::FromAttributeBuffer(const Regs& regs, AttributeBuffer& input) {
     // Setup output data
     OutputVertex ret;
-    // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
-    // figure out what those circumstances are and enable the remaining outputs then.
-    unsigned index = 0;
-    for (unsigned i = 0; i < 7; ++i) {
 
-        if (index >= regs.vs_output_total)
-            break;
-
-        if ((output_mask & (1 << i)) == 0)
-            continue;
-
-        const auto& output_register_map = regs.vs_output_attributes[index];
+    unsigned int num_attributes = regs.vs_output_total;
+    for (unsigned int i = 0; i < num_attributes; ++i) {
+        const auto& output_register_map = regs.vs_output_attributes[i];
 
         u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y,
                             output_register_map.map_z, output_register_map.map_w};
@@ -42,15 +34,13 @@ OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], co
         for (unsigned comp = 0; comp < 4; ++comp) {
             float24* out = ((float24*)&ret) + semantics[comp];
             if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
-                *out = output_regs[i][comp];
+                *out = input.attr[i][comp];
             } else {
                 // Zero output so that attributes which aren't output won't have denormals in them,
                 // which would slow us down later.
                 memset(out, 0, sizeof(*out));
             }
         }
-
-        index++;
     }
 
     // The hardware takes the absolute and saturates vertex colors like this, *before* doing
@@ -80,6 +70,13 @@ void UnitState::LoadInput(const Regs::ShaderConfig& config, const AttributeBuffe
     }
 }
 
+void UnitState::WriteOutput(const Regs::ShaderConfig& config, AttributeBuffer& output) {
+    unsigned int output_i = 0;
+    for (unsigned int reg : Common::BitSet<u32>(config.output_mask)) {
+        output.attr[output_i++] = registers.output[reg];
+    }
+}
+
 MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
 #ifdef ARCHITECTURE_x86_64
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 43a8b848c..00bd723cf 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -74,8 +74,7 @@ struct OutputVertex {
         return ret;
     }
 
-    static OutputVertex FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs,
-                                      u32 output_mask);
+    static OutputVertex FromAttributeBuffer(const Regs& regs, AttributeBuffer& output);
 };
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
@@ -141,6 +140,8 @@ struct UnitState {
      * @param input Attribute buffer to load into the input registers.
      */
     void LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input);
+
+    void WriteOutput(const Regs::ShaderConfig& config, AttributeBuffer& output);
 };
 
 struct ShaderSetup {

From d36ec905b1d9536198e584915024ed65f0342ab2 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 21:48:12 -0800
Subject: [PATCH 6/9] Common: Optimize BitSet iterator

---
 src/common/bit_set.h | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/common/bit_set.h b/src/common/bit_set.h
index 3059d0cb0..9c2e6b28c 100644
--- a/src/common/bit_set.h
+++ b/src/common/bit_set.h
@@ -121,22 +121,19 @@ public:
     class Iterator {
     public:
         Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
-        Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+        Iterator(IntTy val) : m_val(val), m_bit(0) {}
         Iterator& operator=(Iterator other) {
             new (this) Iterator(other);
             return *this;
         }
         int operator*() {
-            return m_bit;
+            return m_bit + ComputeLsb();
         }
         Iterator& operator++() {
-            if (m_val == 0) {
-                m_bit = -1;
-            } else {
-                int bit = LeastSignificantSetBit(m_val);
-                m_val &= ~(1 << bit);
-                m_bit = bit;
-            }
+            int lsb = ComputeLsb();
+            m_val >>= lsb + 1;
+            m_bit += lsb + 1;
+            m_has_lsb = false;
             return *this;
         }
         Iterator operator++(int _) {
@@ -145,15 +142,24 @@ public:
             return other;
         }
         bool operator==(Iterator other) const {
-            return m_bit == other.m_bit;
+            return m_val == other.m_val;
         }
         bool operator!=(Iterator other) const {
-            return m_bit != other.m_bit;
+            return m_val != other.m_val;
         }
 
     private:
+        int ComputeLsb() {
+            if (!m_has_lsb) {
+                m_lsb = LeastSignificantSetBit(m_val);
+                m_has_lsb = true;
+            }
+            return m_lsb;
+        }
         IntTy m_val;
         int m_bit;
+        int m_lsb = -1;
+        bool m_has_lsb = false;
     };
 
     BitSet() : m_val(0) {}
@@ -221,11 +227,10 @@ public:
     }
 
     Iterator begin() const {
-        Iterator it(m_val, 0);
-        return ++it;
+        return Iterator(m_val);
     }
     Iterator end() const {
-        return Iterator(m_val, -1);
+        return Iterator(0);
     }
 
     IntTy m_val;

From 8ed9f9d49f716487f14736c48a7850129a5910ba Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 23:42:29 -0800
Subject: [PATCH 7/9] VideoCore/Shader: Clean up
 OutputVertex::FromAttributeBuffer

This also fixes a long-standing but nevertheless harmless memory
corruption bug, where the padding of the OutputVertex struct would get
corrupted by unused attributes.
---
 src/video_core/pica.h            |  3 ++-
 src/video_core/shader/shader.cpp | 23 ++++++++++++++---------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index ac81a3d0f..e326f7727 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -99,7 +99,8 @@ struct Regs {
             TEXCOORD1_U = 14,
             TEXCOORD1_V = 15,
 
-            // TODO: Not verified
+            TEXCOORD0_W = 16,
+
             VIEW_X = 18,
             VIEW_Y = 19,
             VIEW_Z = 20,
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 99a22c2dd..2c6e45ac4 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -22,23 +22,28 @@ namespace Shader {
 
 OutputVertex OutputVertex::FromAttributeBuffer(const Regs& regs, AttributeBuffer& input) {
     // Setup output data
-    OutputVertex ret;
+    union {
+        OutputVertex ret{};
+        std::array<float24, 24> vertex_slots;
+    };
+    static_assert(sizeof(vertex_slots) <= sizeof(ret), "Struct and array have different sizes.");
 
     unsigned int num_attributes = regs.vs_output_total;
+    ASSERT(num_attributes <= 7);
     for (unsigned int i = 0; i < num_attributes; ++i) {
         const auto& output_register_map = regs.vs_output_attributes[i];
 
-        u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y,
-                            output_register_map.map_z, output_register_map.map_w};
+        Regs::VSOutputAttributes::Semantic semantics[4] = {
+            output_register_map.map_x, output_register_map.map_y, output_register_map.map_z,
+            output_register_map.map_w};
 
         for (unsigned comp = 0; comp < 4; ++comp) {
-            float24* out = ((float24*)&ret) + semantics[comp];
-            if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
+            Regs::VSOutputAttributes::Semantic semantic = semantics[comp];
+            float24* out = &vertex_slots[semantic];
+            if (semantic < vertex_slots.size()) {
                 *out = input.attr[i][comp];
-            } else {
-                // Zero output so that attributes which aren't output won't have denormals in them,
-                // which would slow us down later.
-                memset(out, 0, sizeof(*out));
+            } else if (semantic != Regs::VSOutputAttributes::INVALID) {
+                LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic);
             }
         }
     }

From dcdffabfe69d0cecd2d8c0c1f217b884b20af643 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 18 Dec 2016 23:43:37 -0800
Subject: [PATCH 8/9] VideoCore: Extract swrast-specific data from OutputVertex

---
 src/video_core/clipper.cpp       | 24 +++++++++-------
 src/video_core/rasterizer.cpp    |  7 ++---
 src/video_core/rasterizer.h      | 40 ++++++++++++++++++++++----
 src/video_core/shader/shader.cpp |  2 +-
 src/video_core/shader/shader.h   | 49 +++++++++-----------------------
 5 files changed, 64 insertions(+), 58 deletions(-)

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 05b5cea73..0774ffc53 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -18,6 +18,8 @@
 #include "video_core/rasterizer.h"
 #include "video_core/shader/shader.h"
 
+using Pica::Rasterizer::Vertex;
+
 namespace Pica {
 
 namespace Clipper {
@@ -29,20 +31,20 @@ public:
                                                  float24::FromFloat32(0), float24::FromFloat32(0)))
         : coeffs(coeffs), bias(bias) {}
 
-    bool IsInside(const OutputVertex& vertex) const {
+    bool IsInside(const Vertex& vertex) const {
         return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
     }
 
-    bool IsOutSide(const OutputVertex& vertex) const {
+    bool IsOutSide(const Vertex& vertex) const {
         return !IsInside(vertex);
     }
 
-    OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
+    Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const {
         float24 dp = Math::Dot(v0.pos + bias, coeffs);
         float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
         float24 factor = dp_prev / (dp_prev - dp);
 
-        return OutputVertex::Lerp(factor, v0, v1);
+        return Vertex::Lerp(factor, v0, v1);
     }
 
 private:
@@ -51,7 +53,7 @@ private:
     Math::Vec4<float24> bias;
 };
 
-static void InitScreenCoordinates(OutputVertex& vtx) {
+static void InitScreenCoordinates(Vertex& vtx) {
     struct {
         float24 halfsize_x;
         float24 offset_x;
@@ -91,8 +93,8 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
     // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
     static const size_t MAX_VERTICES = 9;
-    static_vector<OutputVertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
-    static_vector<OutputVertex, MAX_VERTICES> buffer_b;
+    static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
+    static_vector<Vertex, MAX_VERTICES> buffer_b;
     auto* output_list = &buffer_a;
     auto* input_list = &buffer_b;
 
@@ -123,7 +125,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
         std::swap(input_list, output_list);
         output_list->clear();
 
-        const OutputVertex* reference_vertex = &input_list->back();
+        const Vertex* reference_vertex = &input_list->back();
 
         for (const auto& vertex : *input_list) {
             // NOTE: This algorithm changes vertex order in some cases!
@@ -148,9 +150,9 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     InitScreenCoordinates((*output_list)[1]);
 
     for (size_t i = 0; i < output_list->size() - 2; i++) {
-        OutputVertex& vtx0 = (*output_list)[0];
-        OutputVertex& vtx1 = (*output_list)[i + 1];
-        OutputVertex& vtx2 = (*output_list)[i + 2];
+        Vertex& vtx0 = (*output_list)[0];
+        Vertex& vtx1 = (*output_list)[i + 1];
+        Vertex& vtx2 = (*output_list)[i + 2];
 
         InitScreenCoordinates(vtx2);
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index b9f5d4533..0674eb85e 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -307,8 +307,8 @@ MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 24
  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
  * culling via recursion.
  */
-static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1,
-                                    const Shader::OutputVertex& v2, bool reversed = false) {
+static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+                                    bool reversed = false) {
     const auto& regs = g_state.regs;
     MICROPROFILE_SCOPE(GPU_Rasterization);
 
@@ -1276,8 +1276,7 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader
     }
 }
 
-void ProcessTriangle(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1,
-                     const Shader::OutputVertex& v2) {
+void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2) {
     ProcessTriangleInternal(v0, v1, v2);
 }
 
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h
index 6cbda3067..3a72ac343 100644
--- a/src/video_core/rasterizer.h
+++ b/src/video_core/rasterizer.h
@@ -4,16 +4,44 @@
 
 #pragma once
 
-namespace Pica {
+#include "video_core/shader/shader.h"
 
-namespace Shader {
-struct OutputVertex;
-}
+namespace Pica {
 
 namespace Rasterizer {
 
-void ProcessTriangle(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1,
-                     const Shader::OutputVertex& v2);
+struct Vertex : Shader::OutputVertex {
+    Vertex(const OutputVertex& v) : OutputVertex(v) {}
+
+    // Attributes used to store intermediate results
+    // position after perspective divide
+    Math::Vec3<float24> screenpos;
+
+    // Linear interpolation
+    // factor: 0=this, 1=vtx
+    void Lerp(float24 factor, const Vertex& vtx) {
+        pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
+
+        // TODO: Should perform perspective correct interpolation here...
+        tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
+        tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
+        tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
+
+        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
+
+        color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
+    }
+
+    // Linear interpolation
+    // factor: 0=v0, 1=v1
+    static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) {
+        Vertex ret = v0;
+        ret.Lerp(factor, v1);
+        return ret;
+    }
+};
+
+void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2);
 
 } // namespace Rasterizer
 
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 2c6e45ac4..f5f7ea61d 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -26,7 +26,7 @@ OutputVertex OutputVertex::FromAttributeBuffer(const Regs& regs, AttributeBuffer
         OutputVertex ret{};
         std::array<float24, 24> vertex_slots;
     };
-    static_assert(sizeof(vertex_slots) <= sizeof(ret), "Struct and array have different sizes.");
+    static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes.");
 
     unsigned int num_attributes = regs.vs_output_total;
     ASSERT(num_attributes <= 7);
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 00bd723cf..b188d3edf 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -28,9 +28,6 @@ struct AttributeBuffer {
 };
 
 struct OutputVertex {
-    OutputVertex() = default;
-
-    // VS output attributes
     Math::Vec4<float24> pos;
     Math::Vec4<float24> quat;
     Math::Vec4<float24> color;
@@ -42,42 +39,22 @@ struct OutputVertex {
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
 
-    // Padding for optimal alignment
-    INSERT_PADDING_WORDS(4);
-
-    // Attributes used to store intermediate results
-
-    // position after perspective divide
-    Math::Vec3<float24> screenpos;
-    INSERT_PADDING_WORDS(1);
-
-    // Linear interpolation
-    // factor: 0=this, 1=vtx
-    void Lerp(float24 factor, const OutputVertex& vtx) {
-        pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
-
-        // TODO: Should perform perspective correct interpolation here...
-        tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
-        tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
-        tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
-
-        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
-
-        color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
-    }
-
-    // Linear interpolation
-    // factor: 0=v0, 1=v1
-    static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
-        OutputVertex ret = v0;
-        ret.Lerp(factor, v1);
-        return ret;
-    }
-
     static OutputVertex FromAttributeBuffer(const Regs& regs, AttributeBuffer& output);
 };
+#define ASSERT_POS(var, pos)                                                                       \
+    static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong "       \
+                                                                        "offset.")
+ASSERT_POS(pos, Regs::VSOutputAttributes::POSITION_X);
+ASSERT_POS(quat, Regs::VSOutputAttributes::QUATERNION_X);
+ASSERT_POS(color, Regs::VSOutputAttributes::COLOR_R);
+ASSERT_POS(tc0, Regs::VSOutputAttributes::TEXCOORD0_U);
+ASSERT_POS(tc1, Regs::VSOutputAttributes::TEXCOORD1_U);
+ASSERT_POS(tc0_w, Regs::VSOutputAttributes::TEXCOORD0_W);
+ASSERT_POS(view, Regs::VSOutputAttributes::VIEW_X);
+ASSERT_POS(tc2, Regs::VSOutputAttributes::TEXCOORD2_U);
+#undef ASSERT_POS
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
-static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
+static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
 
 /**
  * This structure contains the state information that needs to be unique for a shader unit. The 3DS

From 37a4ea046d80973d59ddb7735a0ffbf0bfd93ad0 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Fri, 27 Jan 2017 18:10:54 -0800
Subject: [PATCH 9/9] VideoCore: Make PrimitiveAssembler const-correct

---
 src/video_core/primitive_assembly.cpp | 2 +-
 src/video_core/primitive_assembly.h   | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index be7377290..e71ff5719 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -14,7 +14,7 @@ PrimitiveAssembler<VertexType>::PrimitiveAssembler(Regs::TriangleTopology topolo
     : topology(topology), buffer_index(0) {}
 
 template <typename VertexType>
-void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx,
+void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
                                                   TriangleHandler triangle_handler) {
     switch (topology) {
     // TODO: Figure out what's different with TriangleTopology::Shader.
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index 0384d5984..24da47382 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -15,7 +15,8 @@ namespace Pica {
  */
 template <typename VertexType>
 struct PrimitiveAssembler {
-    using TriangleHandler = std::function<void(VertexType& v0, VertexType& v1, VertexType& v2)>;
+    using TriangleHandler =
+        std::function<void(const VertexType& v0, const VertexType& v1, const VertexType& v2)>;
 
     PrimitiveAssembler(Regs::TriangleTopology topology = Regs::TriangleTopology::List);
 
@@ -25,7 +26,7 @@ struct PrimitiveAssembler {
      * NOTE: We could specify the triangle handler in the constructor, but this way we can
      * keep event and handler code next to each other.
      */
-    void SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler);
+    void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
 
     /**
      * Resets the internal state of the PrimitiveAssembler.