From 8e1adb95cf7f67b976f105f4cac26d3ff2986057 Mon Sep 17 00:00:00 2001
From: mpnico <mpnico@gmail.com>
Date: Thu, 26 Aug 2021 23:50:28 +0200
Subject: [PATCH] Add support for HLE macros and accelerate
 MultiDrawElementsIndirectCount #2 (#2557)

* Add support for HLE macros and accelerate MultiDrawElementsIndirectCount

* Add missing barrier

* Fix index buffer count

* Add support check for each macro hle before use

* Add missing xml doc

Co-authored-by: gdkchan <gab.dark.100@gmail.com>
---
 Ryujinx.Common/Hash128.cs                     |   6 +
 Ryujinx.Graphics.GAL/Capabilities.cs          |   3 +
 Ryujinx.Graphics.GAL/IPipeline.cs             |   5 +
 .../Engine/GPFifo/GPFifoClass.cs              |   9 +-
 .../Engine/GPFifo/GPFifoDevice.cs             |  30 +++-
 .../Engine/GPFifo/GPFifoProcessor.cs          |  19 ++-
 Ryujinx.Graphics.Gpu/Engine/MME/IMacroEE.cs   |  29 +++-
 Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs      |  51 +++++--
 Ryujinx.Graphics.Gpu/Engine/MME/MacroHLE.cs   | 142 ++++++++++++++++++
 .../Engine/MME/MacroHLEFunctionName.cs        |  11 ++
 .../Engine/MME/MacroHLETable.cs               |  89 +++++++++++
 .../Engine/MME/MacroInterpreter.cs            |   8 +-
 Ryujinx.Graphics.Gpu/Engine/MME/MacroJit.cs   |   2 +-
 .../Engine/MME/MacroJitContext.cs             |   8 +-
 .../Engine/Threed/DrawManager.cs              |  59 ++++++++
 .../Engine/Threed/ThreedClass.cs              |  21 +++
 Ryujinx.Graphics.Gpu/GraphicsConfig.cs        |   5 +
 Ryujinx.Graphics.Gpu/Memory/BufferCache.cs    |  12 ++
 Ryujinx.Graphics.OpenGL/HwCapabilities.cs     |   2 +
 Ryujinx.Graphics.OpenGL/Pipeline.cs           |  58 ++++++-
 Ryujinx.Graphics.OpenGL/Renderer.cs           |   1 +
 Ryujinx.Graphics.OpenGL/VertexArray.cs        |  22 ++-
 22 files changed, 552 insertions(+), 40 deletions(-)
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/MME/MacroHLE.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/MME/MacroHLEFunctionName.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/MME/MacroHLETable.cs

diff --git a/Ryujinx.Common/Hash128.cs b/Ryujinx.Common/Hash128.cs
index 99cd015c4..04457bd0d 100644
--- a/Ryujinx.Common/Hash128.cs
+++ b/Ryujinx.Common/Hash128.cs
@@ -9,6 +9,12 @@ namespace Ryujinx.Common
         public ulong Low;
         public ulong High;
 
+        public Hash128(ulong low, ulong high)
+        {
+            Low = low;
+            High = high;
+        }
+
         public override string ToString()
         {
             return $"{High:x16}{Low:x16}";
diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs
index 937c3f5b7..78a995542 100644
--- a/Ryujinx.Graphics.GAL/Capabilities.cs
+++ b/Ryujinx.Graphics.GAL/Capabilities.cs
@@ -11,6 +11,7 @@ namespace Ryujinx.Graphics.GAL
         public bool SupportsNonConstantTextureOffset { get; }
         public bool SupportsTextureShadowLod { get; }
         public bool SupportsViewportSwizzle { get; }
+        public bool SupportsIndirectParameters { get; }
 
         public int MaximumComputeSharedMemorySize { get; }
         public float MaximumSupportedAnisotropy { get; }
@@ -25,6 +26,7 @@ namespace Ryujinx.Graphics.GAL
             bool supportsNonConstantTextureOffset,
             bool supportsTextureShadowLod,
             bool supportsViewportSwizzle,
+            bool supportsIndirectParameters,
             int maximumComputeSharedMemorySize,
             float maximumSupportedAnisotropy,
             int storageBufferOffsetAlignment)
@@ -37,6 +39,7 @@ namespace Ryujinx.Graphics.GAL
             SupportsNonConstantTextureOffset = supportsNonConstantTextureOffset;
             SupportsTextureShadowLod = supportsTextureShadowLod;
             SupportsViewportSwizzle = supportsViewportSwizzle;
+            SupportsIndirectParameters = supportsIndirectParameters;
             MaximumComputeSharedMemorySize = maximumComputeSharedMemorySize;
             MaximumSupportedAnisotropy = maximumSupportedAnisotropy;
             StorageBufferOffsetAlignment = storageBufferOffsetAlignment;
diff --git a/Ryujinx.Graphics.GAL/IPipeline.cs b/Ryujinx.Graphics.GAL/IPipeline.cs
index b2f9d5cbb..a5af6391a 100644
--- a/Ryujinx.Graphics.GAL/IPipeline.cs
+++ b/Ryujinx.Graphics.GAL/IPipeline.cs
@@ -19,6 +19,8 @@ namespace Ryujinx.Graphics.GAL
             int   stencilValue,
             int   stencilMask);
 
+        void CommandBufferBarrier();
+
         void CopyBuffer(BufferHandle source, BufferHandle destination, int srcOffset, int dstOffset, int size);
 
         void DispatchCompute(int groupsX, int groupsY, int groupsZ);
@@ -33,6 +35,9 @@ namespace Ryujinx.Graphics.GAL
 
         void EndTransformFeedback();
 
+        void MultiDrawIndirectCount(BufferRange indirectBuffer, BufferRange parameterBuffer, int maxDrawCount, int stride);
+        void MultiDrawIndexedIndirectCount(BufferRange indirectBuffer, BufferRange parameterBuffer, int maxDrawCount, int stride);
+
         void SetAlphaTest(bool enable, float reference, CompareOp op);
 
         void SetBlendState(int index, BlendDescriptor blend);
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs
index 28822f4e1..fe49b0f27 100644
--- a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs
@@ -161,6 +161,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         /// <param name="argument">Method call argument</param>
         public void SetReference(int argument)
         {
+            _context.Renderer.Pipeline.CommandBufferBarrier();
+
             _context.CreateHostSyncIfNeeded();
         }
 
@@ -195,10 +197,11 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         /// Pushes an argument to a macro.
         /// </summary>
         /// <param name="index">Index of the macro</param>
+        /// <param name="gpuVa">GPU virtual address where the command word is located</param>
         /// <param name="argument">Argument to be pushed to the macro</param>
-        public void MmePushArgument(int index, int argument)
+        public void MmePushArgument(int index, ulong gpuVa, int argument)
         {
-            _macros[index].PushArgument(argument);
+            _macros[index].PushArgument(gpuVa, argument);
         }
 
         /// <summary>
@@ -208,7 +211,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         /// <param name="argument">Initial argument passed to the macro</param>
         public void MmeStart(int index, int argument)
         {
-            _macros[index].StartExecution(argument);
+            _macros[index].StartExecution(_context, _parent, _macroCode, argument);
         }
 
         /// <summary>
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs
index ada3bc4b7..b3de738d6 100644
--- a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs
@@ -54,11 +54,12 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
             /// <summary>
             /// Fetch the command buffer.
             /// </summary>
-            public void Fetch(MemoryManager memoryManager)
+            /// <param name="flush">If true, flushes potential GPU written data before reading the command buffer</param>
+            public void Fetch(MemoryManager memoryManager, bool flush = true)
             {
                 if (Words == null)
                 {
-                    Words = MemoryMarshal.Cast<byte, int>(memoryManager.GetSpan(EntryAddress, (int)EntryCount * 4, true)).ToArray();
+                    Words = MemoryMarshal.Cast<byte, int>(memoryManager.GetSpan(EntryAddress, (int)EntryCount * 4, flush)).ToArray();
                 }
             }
         }
@@ -73,6 +74,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         private readonly AutoResetEvent _event;
 
         private bool _interrupt;
+        private int _flushSkips;
 
         /// <summary>
         /// Creates a new instance of the GPU General Purpose FIFO device.
@@ -188,8 +190,16 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
             // Process command buffers.
             while (_ibEnable && !_interrupt && _commandBufferQueue.TryDequeue(out CommandBuffer entry))
             {
+                bool flushCommandBuffer = true;
+
+                if (_flushSkips != 0)
+                {
+                    _flushSkips--;
+                    flushCommandBuffer = false;
+                }
+
                 _currentCommandBuffer = entry;
-                _currentCommandBuffer.Fetch(entry.Processor.MemoryManager);
+                _currentCommandBuffer.Fetch(entry.Processor.MemoryManager, flushCommandBuffer);
 
                 // If we are changing the current channel,
                 // we need to force all the host state to be updated.
@@ -199,12 +209,24 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
                     entry.Processor.ForceAllDirty();
                 }
 
-                entry.Processor.Process(_currentCommandBuffer.Words);
+                entry.Processor.Process(entry.EntryAddress, _currentCommandBuffer.Words);
             }
 
             _interrupt = false;
         }
 
+        /// <summary>
+        /// Sets the number of flushes that should be skipped for subsequent command buffers.
+        /// </summary>
+        /// <remarks>
+        /// This can improve performance when command buffer data only needs to be consumed by the GPU.
+        /// </remarks>
+        /// <param name="count">The amount of flushes that should be skipped</param>
+        internal void SetFlushSkips(int count)
+        {
+            _flushSkips = count;
+        }
+
         /// <summary>
         /// Interrupts command processing. This will break out of the DispatchCalls loop.
         /// </summary>
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs
index ea34d6cd1..096b795c6 100644
--- a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs
@@ -28,6 +28,11 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         /// </summary>
         public MemoryManager MemoryManager => _channel.MemoryManager;
 
+        /// <summary>
+        /// 3D Engine.
+        /// </summary>
+        public ThreedClass ThreedClass => _3dClass;
+
         /// <summary>
         /// Internal GPFIFO state.
         /// </summary>
@@ -70,13 +75,16 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         /// <summary>
         /// Processes a command buffer.
         /// </summary>
+        /// <param name="baseGpuVa">Base GPU virtual address of the command buffer</param>
         /// <param name="commandBuffer">Command buffer</param>
-        public void Process(ReadOnlySpan<int> commandBuffer)
+        public void Process(ulong baseGpuVa, ReadOnlySpan<int> commandBuffer)
         {
             for (int index = 0; index < commandBuffer.Length; index++)
             {
                 int command = commandBuffer[index];
 
+                ulong gpuVa = baseGpuVa + (ulong)index * 4;
+
                 if (_state.MethodCount != 0)
                 {
                     if (TryFastI2mBufferUpdate(commandBuffer, ref index))
@@ -84,7 +92,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
                         continue;
                     }
 
-                    Send(_state.Method, command, _state.SubChannel, _state.MethodCount <= 1);
+                    Send(gpuVa, _state.Method, command, _state.SubChannel, _state.MethodCount <= 1);
 
                     if (!_state.NonIncrementing)
                     {
@@ -120,7 +128,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
                             _state.NonIncrementing = meth.SecOp == SecOp.NonIncMethod;
                             break;
                         case SecOp.ImmdDataMethod:
-                            Send(meth.MethodAddress, meth.ImmdData, meth.MethodSubchannel, true);
+                            Send(gpuVa, meth.MethodAddress, meth.ImmdData, meth.MethodSubchannel, true);
                             break;
                     }
                 }
@@ -198,8 +206,9 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
         /// <summary>
         /// Sends a uncompressed method for processing by the graphics pipeline.
         /// </summary>
+        /// <param name="gpuVa">GPU virtual address where the command word is located</param>
         /// <param name="meth">Method to be processed</param>
-        private void Send(int offset, int argument, int subChannel, bool isLastCall)
+        private void Send(ulong gpuVa, int offset, int argument, int subChannel, bool isLastCall)
         {
             if (offset < 0x60)
             {
@@ -243,7 +252,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
 
                     if ((offset & 1) != 0)
                     {
-                        _fifoClass.MmePushArgument(macroIndex, argument);
+                        _fifoClass.MmePushArgument(macroIndex, gpuVa, argument);
                     }
                     else
                     {
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/IMacroEE.cs b/Ryujinx.Graphics.Gpu/Engine/MME/IMacroEE.cs
index b957de08d..640687f00 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MME/IMacroEE.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/IMacroEE.cs
@@ -4,6 +4,33 @@ using System.Collections.Generic;
 
 namespace Ryujinx.Graphics.Gpu.Engine.MME
 {
+    /// <summary>
+    /// FIFO word.
+    /// </summary>
+    struct FifoWord
+    {
+        /// <summary>
+        /// GPU virtual address where the word is located in memory.
+        /// </summary>
+        public ulong GpuVa { get; }
+
+        /// <summary>
+        /// Word value.
+        /// </summary>
+        public int Word { get; }
+
+        /// <summary>
+        /// Creates a new FIFO word.
+        /// </summary>
+        /// <param name="gpuVa">GPU virtual address where the word is located in memory</param>
+        /// <param name="word">Word value</param>
+        public FifoWord(ulong gpuVa, int word)
+        {
+            GpuVa = gpuVa;
+            Word = word;
+        }
+    }
+
     /// <summary>
     /// Macro Execution Engine interface.
     /// </summary>
@@ -12,7 +39,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// <summary>
         /// Arguments FIFO.
         /// </summary>
-        Queue<int> Fifo { get; }
+        Queue<FifoWord> Fifo { get; }
 
         /// <summary>
         /// Should execute the GPU Macro code being passed.
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs b/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs
index 1a79afb93..9d1dbc8fa 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs
@@ -1,4 +1,6 @@
 using Ryujinx.Graphics.Device;
+using Ryujinx.Graphics.Gpu.Engine.GPFifo;
+using Ryujinx.Graphics.Gpu.Memory;
 using System;
 
 namespace Ryujinx.Graphics.Gpu.Engine.MME
@@ -13,10 +15,10 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// </summary>
         public int Position { get; }
 
+        private IMacroEE _executionEngine;
         private bool _executionPending;
         private int _argument;
-
-        private readonly IMacroEE _executionEngine;
+        private MacroHLEFunctionName _hleFunction;
 
         /// <summary>
         /// Creates a new instance of the GPU cached macro program.
@@ -26,28 +28,47 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         {
             Position = position;
 
+            _executionEngine = null;
             _executionPending = false;
             _argument = 0;
-
-            if (GraphicsConfig.EnableMacroJit)
-            {
-                _executionEngine = new MacroJit();
-            }
-            else
-            {
-                _executionEngine = new MacroInterpreter();
-            }
+            _hleFunction = MacroHLEFunctionName.None;
         }
 
         /// <summary>
         /// Sets the first argument for the macro call.
         /// </summary>
+        /// <param name="context">GPU context where the macro code is being executed</param>
+        /// <param name="processor">GPU GP FIFO command processor</param>
+        /// <param name="code">Code to be executed</param>
         /// <param name="argument">First argument</param>
-        public void StartExecution(int argument)
+        public void StartExecution(GpuContext context, GPFifoProcessor processor, ReadOnlySpan<int> code, int argument)
         {
             _argument = argument;
 
             _executionPending = true;
+
+            if (_executionEngine == null)
+            {
+                if (GraphicsConfig.EnableMacroHLE && MacroHLETable.TryGetMacroHLEFunction(code.Slice(Position), context.Capabilities, out _hleFunction))
+                {
+                    _executionEngine = new MacroHLE(processor, _hleFunction);
+                }
+                else if (GraphicsConfig.EnableMacroJit)
+                {
+                    _executionEngine = new MacroJit();
+                }
+                else
+                {
+                    _executionEngine = new MacroInterpreter();
+                }
+            }
+
+            if (_hleFunction == MacroHLEFunctionName.MultiDrawElementsIndirectCount)
+            {
+                // We don't consume the parameter buffer value, so we don't need to flush it.
+                // Doing so improves performance if the value was written by a GPU shader.
+                context.GPFifo.SetFlushSkips(2);
+            }
         }
 
         /// <summary>
@@ -60,7 +81,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
             if (_executionPending)
             {
                 _executionPending = false;
-
                 _executionEngine?.Execute(code.Slice(Position), state, _argument);
             }
         }
@@ -68,10 +88,11 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// <summary>
         /// Pushes an argument to the macro call argument FIFO.
         /// </summary>
+        /// <param name="gpuVa">GPU virtual address where the command word is located</param>
         /// <param name="argument">Argument to be pushed</param>
-        public void PushArgument(int argument)
+        public void PushArgument(ulong gpuVa, int argument)
         {
-            _executionEngine?.Fifo.Enqueue(argument);
+            _executionEngine?.Fifo.Enqueue(new FifoWord(gpuVa, argument));
         }
     }
 }
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLE.cs b/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLE.cs
new file mode 100644
index 000000000..77b44e814
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLE.cs
@@ -0,0 +1,142 @@
+using Ryujinx.Common.Logging;
+using Ryujinx.Graphics.Device;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.Gpu.Engine.GPFifo;
+using Ryujinx.Graphics.Gpu.Engine.Threed;
+using Ryujinx.Graphics.Gpu.Memory;
+using System;
+using System.Collections.Generic;
+
+namespace Ryujinx.Graphics.Gpu.Engine.MME
+{
+    /// <summary>
+    /// Macro High-level emulation.
+    /// </summary>
+    class MacroHLE : IMacroEE
+    {
+        private readonly GPFifoProcessor _processor;
+        private readonly MacroHLEFunctionName _functionName;
+
+        /// <summary>
+        /// Arguments FIFO.
+        /// </summary>
+        public Queue<FifoWord> Fifo { get; }
+
+        /// <summary>
+        /// Creates a new instance of the HLE macro handler.
+        /// </summary>
+        /// <param name="processor">GPU GP FIFO command processor, used to reach the memory manager and the 3D engine</param>
+        /// <param name="functionName">Name of the HLE macro function to be called</param>
+        public MacroHLE(GPFifoProcessor processor, MacroHLEFunctionName functionName)
+        {
+            _processor = processor;
+            _functionName = functionName;
+
+            Fifo = new Queue<FifoWord>();
+        }
+
+        /// <summary>
+        /// Executes the high-level implementation selected for this macro.
+        /// </summary>
+        /// <param name="code">Code of the program to execute</param>
+        /// <param name="state">GPU state at the time of the call</param>
+        /// <param name="arg0">Optional argument passed to the program, 0 if not used</param>
+        public void Execute(ReadOnlySpan<int> code, IDeviceState state, int arg0)
+        {
+            switch (_functionName)
+            {
+                case MacroHLEFunctionName.MultiDrawElementsIndirectCount:
+                    MultiDrawElementsIndirectCount(state, arg0);
+                    break;
+                default:
+                    throw new NotImplementedException(_functionName.ToString());
+            }
+        }
+
+        /// <summary>
+        /// Performs an indirect multi-draw, with parameters from a GPU buffer.
+        /// </summary>
+        /// <param name="state">GPU state at the time of the call</param>
+        /// <param name="arg0">First argument of the call</param>
+        private void MultiDrawElementsIndirectCount(IDeviceState state, int arg0)
+        {
+            int startOffset = arg0;
+            int endOffset = FetchParam().Word;
+            var topology = (PrimitiveTopology)FetchParam().Word;
+            int paddingWords = FetchParam().Word;
+            int maxDrawCount = endOffset - startOffset;
+            int stride = paddingWords * 4 + 0x14;
+
+            if (maxDrawCount <= 0)
+            {
+                // No draws requested. A negative count would also become a huge unsigned buffer size below.
+                Fifo.Clear();
+                return;
+            }
+
+            int indirectBufferSize = maxDrawCount * stride;
+            ulong parameterBufferGpuVa = FetchParam().GpuVa;
+            ulong indirectBufferGpuVa = 0;
+            int indexCount = 0;
+
+            for (int i = 0; i < maxDrawCount; i++)
+            {
+                var count = FetchParam();
+                var instanceCount = FetchParam();
+                var firstIndex = FetchParam();
+                var baseVertex = FetchParam();
+                var baseInstance = FetchParam();
+
+                if (i == 0)
+                {
+                    indirectBufferGpuVa = count.GpuVa;
+                }
+
+                indexCount = Math.Max(indexCount, count.Word + firstIndex.Word);
+
+                if (i != maxDrawCount - 1)
+                {
+                    for (int j = 0; j < paddingWords; j++)
+                    {
+                        FetchParam();
+                    }
+                }
+            }
+
+            // It should be empty at this point, but clear it just to be safe.
+            Fifo.Clear();
+
+            var parameterBuffer = _processor.MemoryManager.Physical.BufferCache.GetGpuBufferRange(_processor.MemoryManager, parameterBufferGpuVa, 4);
+            var indirectBuffer = _processor.MemoryManager.Physical.BufferCache.GetGpuBufferRange(_processor.MemoryManager, indirectBufferGpuVa, (ulong)indirectBufferSize);
+
+            _processor.ThreedClass.MultiDrawIndirectCount(indexCount, topology, indirectBuffer, parameterBuffer, maxDrawCount, stride);
+        }
+
+        /// <summary>
+        /// Fetches an argument from the arguments FIFO.
+        /// </summary>
+        /// <returns>The call argument, or a 0 value with null address if the FIFO is empty</returns>
+        private FifoWord FetchParam()
+        {
+            if (!Fifo.TryDequeue(out var value))
+            {
+                Logger.Warning?.Print(LogClass.Gpu, "Macro attempted to fetch an inexistent argument.");
+
+                return new FifoWord(0UL, 0);
+            }
+
+            return value;
+        }
+
+        /// <summary>
+        /// Performs a GPU method call.
+        /// </summary>
+        /// <param name="state">Current GPU state</param>
+        /// <param name="methAddr">Address, in words, of the method</param>
+        /// <param name="value">Call argument</param>
+        private static void Send(IDeviceState state, int methAddr, int value)
+        {
+            state.Write(methAddr * 4, value);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLEFunctionName.cs b/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLEFunctionName.cs
new file mode 100644
index 000000000..60354a9bc
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLEFunctionName.cs
@@ -0,0 +1,11 @@
+namespace Ryujinx.Graphics.Gpu.Engine.MME
+{
+    /// <summary>
+    /// Identifies the Macro function that a High-level implementation stands in for.
+    /// </summary>
+    enum MacroHLEFunctionName
+    {
+        None = 0,
+        MultiDrawElementsIndirectCount = 1
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLETable.cs b/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLETable.cs
new file mode 100644
index 000000000..77d041adf
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/MacroHLETable.cs
@@ -0,0 +1,89 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using System;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Gpu.Engine.MME
+{
+    /// <summary>
+    /// Table with information about High-level implementations of GPU Macro code.
+    /// </summary>
+    static class MacroHLETable
+    {
+        /// <summary>
+        /// Macro High-level implementation table entry.
+        /// </summary>
+        struct TableEntry
+        {
+            /// <summary>
+            /// Name of the Macro function.
+            /// </summary>
+            public MacroHLEFunctionName Name { get; }
+
+            /// <summary>
+            /// Hash of the original binary Macro function code.
+            /// </summary>
+            public Hash128 Hash { get; }
+
+            /// <summary>
+            /// Size (in bytes) of the original binary Macro function code.
+            /// </summary>
+            public int Length { get; }
+
+            /// <summary>
+            /// Creates a new table entry.
+            /// </summary>
+            /// <param name="name">Name of the Macro function</param>
+            /// <param name="hash">Hash of the original binary Macro function code</param>
+            /// <param name="length">Size (in bytes) of the original binary Macro function code</param>
+            public TableEntry(MacroHLEFunctionName name, Hash128 hash, int length)
+            {
+                Name = name;
+                Hash = hash;
+                Length = length;
+            }
+        }
+
+        private static readonly TableEntry[] Table = new TableEntry[]
+        {
+            new TableEntry(MacroHLEFunctionName.MultiDrawElementsIndirectCount, new Hash128(0x890AF57ED3FB1C37, 0x35D0C95C61F5386F), 0x19C)
+        };
+
+        /// <summary>
+        /// Checks if the renderer capabilities allow the High-level implementation of a Macro function to be used.
+        /// </summary>
+        /// <param name="caps">Renderer capabilities</param>
+        /// <param name="name">Name of the Macro function to be checked</param>
+        /// <returns>True if the High-level implementation can be used, false otherwise</returns>
+        private static bool IsMacroHLESupported(Capabilities caps, MacroHLEFunctionName name)
+        {
+            return name == MacroHLEFunctionName.MultiDrawElementsIndirectCount && caps.SupportsIndirectParameters;
+        }
+
+        /// <summary>
+        /// Checks if there's a fast, High-level implementation of the specified Macro code available.
+        /// </summary>
+        /// <param name="code">Macro code to be checked</param>
+        /// <param name="caps">Renderer capabilities to check for this macro HLE support</param>
+        /// <param name="name">Name of the function if an implementation is available and supported, otherwise <see cref="MacroHLEFunctionName.None"/></param>
+        /// <returns>True if there is an implementation available and supported, false otherwise</returns>
+        public static bool TryGetMacroHLEFunction(ReadOnlySpan<int> code, Capabilities caps, out MacroHLEFunctionName name)
+        {
+            var mc = MemoryMarshal.Cast<int, byte>(code);
+
+            for (int i = 0; i < Table.Length; i++)
+            {
+                ref var entry = ref Table[i];
+                // Only hash when the code is long enough, slicing past its end would throw.
+                if (mc.Length >= entry.Length && XXHash128.ComputeHash(mc.Slice(0, entry.Length)) == entry.Hash && IsMacroHLESupported(caps, entry.Name))
+                {
+                    name = entry.Name;
+                    return true;
+                }
+            }
+
+            name = MacroHLEFunctionName.None;
+            return false;
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/MacroInterpreter.cs b/Ryujinx.Graphics.Gpu/Engine/MME/MacroInterpreter.cs
index 0173a7fb3..df6ee040e 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MME/MacroInterpreter.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/MacroInterpreter.cs
@@ -13,7 +13,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// <summary>
         /// Arguments FIFO.
         /// </summary>
-        public Queue<int> Fifo { get; }
+        public Queue<FifoWord> Fifo { get; }
 
         private int[] _gprs;
 
@@ -34,7 +34,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// </summary>
         public MacroInterpreter()
         {
-            Fifo = new Queue<int>();
+            Fifo = new Queue<FifoWord>();
 
             _gprs = new int[8];
         }
@@ -364,14 +364,14 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// <returns>The call argument, or 0 if the FIFO is empty</returns>
         private int FetchParam()
         {
-            if (!Fifo.TryDequeue(out int value))
+            if (!Fifo.TryDequeue(out var value))
             {
                 Logger.Warning?.Print(LogClass.Gpu, "Macro attempted to fetch an inexistent argument.");
 
                 return 0;
             }
 
-            return value;
+            return value.Word;
         }
 
         /// <summary>
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/MacroJit.cs b/Ryujinx.Graphics.Gpu/Engine/MME/MacroJit.cs
index f0393dd14..4077f74ec 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MME/MacroJit.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/MacroJit.cs
@@ -14,7 +14,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// <summary>
         /// Arguments FIFO.
         /// </summary>
-        public Queue<int> Fifo => _context.Fifo;
+        public Queue<FifoWord> Fifo => _context.Fifo;
 
         private MacroJitCompiler.MacroExecute _execute;
 
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/MacroJitContext.cs b/Ryujinx.Graphics.Gpu/Engine/MME/MacroJitContext.cs
index aa31c9ee4..52c2a11b2 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MME/MacroJitContext.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/MacroJitContext.cs
@@ -12,22 +12,22 @@ namespace Ryujinx.Graphics.Gpu.Engine.MME
         /// <summary>
         /// Arguments FIFO.
         /// </summary>
-        public Queue<int> Fifo { get; } = new Queue<int>();
+        public Queue<FifoWord> Fifo { get; } = new Queue<FifoWord>();
 
         /// <summary>
         /// Fetches a arguments from the arguments FIFO.
         /// </summary>
-        /// <returns></returns>
+        /// <returns>The call argument, or 0 if the FIFO is empty</returns>
         public int FetchParam()
         {
-            if (!Fifo.TryDequeue(out int value))
+            if (!Fifo.TryDequeue(out var value))
             {
                 Logger.Warning?.Print(LogClass.Gpu, "Macro attempted to fetch an inexistent argument.");
 
                 return 0;
             }
 
-            return value;
+            return value.Word;
         }
 
         /// <summary>
diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/DrawManager.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/DrawManager.cs
index d58f175db..2443917c5 100644
--- a/Ryujinx.Graphics.Gpu/Engine/Threed/DrawManager.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/Threed/DrawManager.cs
@@ -26,6 +26,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
 
         private int _instanceIndex;
 
+        private const int IndexBufferCountMethodOffset = 0x5f8;
+
         /// <summary>
         /// Creates a new instance of the draw manager.
         /// </summary>
@@ -304,6 +306,67 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
             _drawState.DrawIndexed = oldDrawIndexed;
         }
 
+        /// <summary>
+        /// Performs an indirect multi-draw, with parameters from a GPU buffer.
+        /// </summary>
+        /// <param name="engine">3D engine where this method is being called</param>
+        /// <param name="indexCount">Index Buffer Count</param>
+        /// <param name="topology">Primitive topology</param>
+        /// <param name="indirectBuffer">GPU buffer with the draw parameters, such as count, first index, etc</param>
+        /// <param name="parameterBuffer">GPU buffer with the draw count</param>
+        /// <param name="maxDrawCount">Maximum number of draws that can be made</param>
+        /// <param name="stride">Distance in bytes between each element on the <paramref name="indirectBuffer"/> array</param>
+        public void MultiDrawIndirectCount(
+            ThreedClass engine,
+            int indexCount,
+            PrimitiveTopology topology,
+            BufferRange indirectBuffer,
+            BufferRange parameterBuffer,
+            int maxDrawCount,
+            int stride)
+        {
+            // Write the index count to the engine state before the state update below.
+            engine.Write(IndexBufferCountMethodOffset * 4, indexCount);
+
+            _context.Renderer.Pipeline.SetPrimitiveTopology(topology);
+            _drawState.Topology = topology;
+
+            ConditionalRenderEnabled renderEnable = ConditionalRendering.GetRenderEnable(
+                _context,
+                _channel.MemoryManager,
+                _state.State.RenderEnableAddress,
+                _state.State.RenderEnableCondition);
+
+            if (renderEnable == ConditionalRenderEnabled.False)
+            {
+                // Conditional rendering rejected the draw: clear the indexed flag and skip it.
+                _drawState.DrawIndexed = false;
+                return;
+            }
+
+            _drawState.FirstIndex = _state.State.IndexBufferState.First;
+            _drawState.IndexCount = indexCount;
+
+            engine.UpdateState();
+
+            if (_drawState.DrawIndexed)
+            {
+                _context.Renderer.Pipeline.MultiDrawIndexedIndirectCount(indirectBuffer, parameterBuffer, maxDrawCount, stride);
+            }
+            else
+            {
+                _context.Renderer.Pipeline.MultiDrawIndirectCount(indirectBuffer, parameterBuffer, maxDrawCount, stride);
+            }
+
+            _drawState.DrawIndexed = false;
+
+            if (renderEnable == ConditionalRenderEnabled.Host)
+            {
+                // Host conditional rendering was used for this draw, so end it here.
+                _context.Renderer.Pipeline.EndHostConditionalRendering();
+            }
+        }
+
         /// <summary>
         /// Perform any deferred draws.
         /// This is used for instanced draws.
diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs
index 37c8fec2e..3d02af96b 100644
--- a/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs
@@ -1,4 +1,5 @@
 using Ryujinx.Graphics.Device;
+using Ryujinx.Graphics.GAL;
 using Ryujinx.Graphics.Gpu.Engine.InlineToMemory;
 using System;
 using System.Collections.Generic;
@@ -433,5 +434,25 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
         {
             return 0;
         }
+
+        /// <summary>
+        /// Performs an indirect multi-draw, with parameters from a GPU buffer, by forwarding to the draw manager.
+        /// </summary>
+        /// <param name="indexCount">Index Buffer Count</param>
+        /// <param name="topology">Primitive topology</param>
+        /// <param name="indirectBuffer">GPU buffer with the draw parameters, such as count, first index, etc</param>
+        /// <param name="parameterBuffer">GPU buffer with the draw count</param>
+        /// <param name="maxDrawCount">Maximum number of draws that can be made</param>
+        /// <param name="stride">Distance in bytes between each element on the <paramref name="indirectBuffer"/> array</param>
+        public void MultiDrawIndirectCount(
+            int indexCount,
+            PrimitiveTopology topology,
+            BufferRange indirectBuffer,
+            BufferRange parameterBuffer,
+            int maxDrawCount,
+            int stride)
+        {
+            _drawManager.MultiDrawIndirectCount(this, indexCount, topology, indirectBuffer, parameterBuffer, maxDrawCount, stride);
+        }
     }
 }
diff --git a/Ryujinx.Graphics.Gpu/GraphicsConfig.cs b/Ryujinx.Graphics.Gpu/GraphicsConfig.cs
index 7ef102e2c..d58b8da79 100644
--- a/Ryujinx.Graphics.Gpu/GraphicsConfig.cs
+++ b/Ryujinx.Graphics.Gpu/GraphicsConfig.cs
@@ -33,6 +33,11 @@ namespace Ryujinx.Graphics.Gpu
         /// </summary>
         public static bool EnableMacroJit = true;
 
+        /// <summary>
+        /// Enables or disables high-level emulation of common GPU Macro code.
+        /// </summary>
+        public static bool EnableMacroHLE = true;
+
         /// <summary>
         /// Title id of the current running game.
         /// Used by the shader cache.
diff --git a/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs b/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs
index 58dd838e5..63d221508 100644
--- a/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs
+++ b/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs
@@ -297,6 +297,18 @@ namespace Ryujinx.Graphics.Gpu.Memory
             buffer.SignalModified(address, size);
         }
 
+        /// <summary>
+        /// Gets a buffer sub-range for a given GPU memory range, going through TranslateAndCreateBuffer first.
+        /// </summary>
+        /// <param name="memoryManager">GPU memory manager where the buffer is mapped</param>
+        /// <param name="gpuVa">Start GPU virtual address of the buffer</param>
+        /// <param name="size">Size in bytes of the buffer</param>
+        /// <returns>The buffer sub-range for the given range</returns>
+        public BufferRange GetGpuBufferRange(MemoryManager memoryManager, ulong gpuVa, ulong size)
+        {
+            return GetBufferRange(TranslateAndCreateBuffer(memoryManager, gpuVa, size), size);
+        }
+
         /// <summary>
         /// Gets a buffer sub-range starting at a given memory address.
         /// </summary>
diff --git a/Ryujinx.Graphics.OpenGL/HwCapabilities.cs b/Ryujinx.Graphics.OpenGL/HwCapabilities.cs
index 44365ca75..dd917b7b2 100644
--- a/Ryujinx.Graphics.OpenGL/HwCapabilities.cs
+++ b/Ryujinx.Graphics.OpenGL/HwCapabilities.cs
@@ -13,6 +13,7 @@ namespace Ryujinx.Graphics.OpenGL
         private static readonly Lazy<bool> _supportsSeamlessCubemapPerTexture = new Lazy<bool>(() => HasExtension("GL_ARB_seamless_cubemap_per_texture"));
         private static readonly Lazy<bool> _supportsTextureShadowLod          = new Lazy<bool>(() => HasExtension("GL_EXT_texture_shadow_lod"));
         private static readonly Lazy<bool> _supportsViewportSwizzle           = new Lazy<bool>(() => HasExtension("GL_NV_viewport_swizzle"));
+        private static readonly Lazy<bool> _supportsIndirectParameters        = new Lazy<bool>(() => HasExtension("GL_ARB_indirect_parameters"));
 
         private static readonly Lazy<int> _maximumComputeSharedMemorySize = new Lazy<int>(() => GetLimit(All.MaxComputeSharedMemorySize));
         private static readonly Lazy<int> _storageBufferOffsetAlignment   = new Lazy<int>(() => GetLimit(All.ShaderStorageBufferOffsetAlignment));
@@ -46,6 +47,7 @@ namespace Ryujinx.Graphics.OpenGL
         public static bool SupportsSeamlessCubemapPerTexture => _supportsSeamlessCubemapPerTexture.Value;
         public static bool SupportsTextureShadowLod          => _supportsTextureShadowLod.Value;
         public static bool SupportsViewportSwizzle           => _supportsViewportSwizzle.Value;
+        public static bool SupportsIndirectParameters        => _supportsIndirectParameters.Value;
 
         public static bool SupportsMismatchingViewFormat    => _gpuVendor.Value != GpuVendor.AmdWindows && _gpuVendor.Value != GpuVendor.IntelWindows;
         public static bool SupportsNonConstantTextureOffset => _gpuVendor.Value == GpuVendor.Nvidia;
diff --git a/Ryujinx.Graphics.OpenGL/Pipeline.cs b/Ryujinx.Graphics.OpenGL/Pipeline.cs
index be526fa99..24dd97f99 100644
--- a/Ryujinx.Graphics.OpenGL/Pipeline.cs
+++ b/Ryujinx.Graphics.OpenGL/Pipeline.cs
@@ -166,6 +166,11 @@ namespace Ryujinx.Graphics.OpenGL
             }
         }
 
+        public void CommandBufferBarrier()
+        {
+            GL.MemoryBarrier(MemoryBarrierFlags.CommandBarrierBit); // Later indirect commands see buffer data written by earlier shaders
+        }
+
         public void CopyBuffer(BufferHandle source, BufferHandle destination, int srcOffset, int dstOffset, int size)
         {
             Buffer.Copy(source, destination, srcOffset, dstOffset, size);
@@ -543,6 +548,57 @@ namespace Ryujinx.Graphics.OpenGL
             _tfEnabled = false;
         }
 
+        public void MultiDrawIndirectCount(BufferRange indirectBuffer, BufferRange parameterBuffer, int maxDrawCount, int stride)
+        {
+            if (!_program.IsLinked)
+            {
+                Logger.Debug?.Print(LogClass.Gpu, "Draw error, shader not linked.");
+                return; // The draw is skipped when the program is not linked
+            }
+
+            PreDraw();
+
+            GL.BindBuffer((BufferTarget)All.DrawIndirectBuffer, indirectBuffer.Handle.ToInt32()); // Buffer holding the draw parameters
+            GL.BindBuffer((BufferTarget)All.ParameterBuffer, parameterBuffer.Handle.ToInt32()); // Buffer holding the draw count
+
+            GL.MultiDrawArraysIndirectCount(
+                _primitiveType,
+                (IntPtr)indirectBuffer.Offset,
+                (IntPtr)parameterBuffer.Offset,
+                maxDrawCount,
+                stride);
+
+            PostDraw();
+        }
+
+        public void MultiDrawIndexedIndirectCount(BufferRange indirectBuffer, BufferRange parameterBuffer, int maxDrawCount, int stride)
+        {
+            if (!_program.IsLinked)
+            {
+                Logger.Debug?.Print(LogClass.Gpu, "Draw error, shader not linked.");
+                return; // The draw is skipped when the program is not linked
+            }
+
+            PreDraw();
+
+            _vertexArray.SetRangeOfIndexBuffer(); // Binds a temporary copy of the current index buffer range
+
+            GL.BindBuffer((BufferTarget)All.DrawIndirectBuffer, indirectBuffer.Handle.ToInt32()); // Buffer holding the draw parameters
+            GL.BindBuffer((BufferTarget)All.ParameterBuffer, parameterBuffer.Handle.ToInt32()); // Buffer holding the draw count
+
+            GL.MultiDrawElementsIndirectCount(
+                _primitiveType,
+                (Version46)_elementsType,
+                (IntPtr)indirectBuffer.Offset,
+                (IntPtr)parameterBuffer.Offset,
+                maxDrawCount,
+                stride);
+
+            _vertexArray.RestoreIndexBuffer(); // Rebinds the original index buffer after the draw
+
+            PostDraw();
+        }
+
         public void SetAlphaTest(bool enable, float reference, CompareOp op)
         {
             if (!enable)
@@ -741,7 +797,7 @@ namespace Ryujinx.Graphics.OpenGL
 
             EnsureVertexArray();
 
-            _vertexArray.SetIndexBuffer(buffer.Handle);
+            _vertexArray.SetIndexBuffer(buffer);
         }
 
         public void SetLogicOpState(bool enable, LogicalOp op)
diff --git a/Ryujinx.Graphics.OpenGL/Renderer.cs b/Ryujinx.Graphics.OpenGL/Renderer.cs
index 01072176e..6b620bb8e 100644
--- a/Ryujinx.Graphics.OpenGL/Renderer.cs
+++ b/Ryujinx.Graphics.OpenGL/Renderer.cs
@@ -107,6 +107,7 @@ namespace Ryujinx.Graphics.OpenGL
                 HwCapabilities.SupportsNonConstantTextureOffset,
                 HwCapabilities.SupportsTextureShadowLod,
                 HwCapabilities.SupportsViewportSwizzle,
+                HwCapabilities.SupportsIndirectParameters,
                 HwCapabilities.MaximumComputeSharedMemorySize,
                 HwCapabilities.MaximumSupportedAnisotropy,
                 HwCapabilities.StorageBufferOffsetAlignment);
diff --git a/Ryujinx.Graphics.OpenGL/VertexArray.cs b/Ryujinx.Graphics.OpenGL/VertexArray.cs
index f2fcba1f0..bdf14481e 100644
--- a/Ryujinx.Graphics.OpenGL/VertexArray.cs
+++ b/Ryujinx.Graphics.OpenGL/VertexArray.cs
@@ -20,12 +20,17 @@ namespace Ryujinx.Graphics.OpenGL
         private uint _vertexAttribsInUse;
         private uint _vertexBuffersInUse;
 
+        private BufferRange _indexBuffer;
+        private BufferHandle _tempIndexBuffer;
+
         public VertexArray()
         {
             Handle = GL.GenVertexArray();
 
             _vertexAttribs = new VertexAttribDescriptor[Constants.MaxVertexAttribs];
             _vertexBuffers = new VertexBufferDescriptor[Constants.MaxVertexBuffers];
+
+            _tempIndexBuffer = Buffer.Create();
         }
 
         public void Bind()
@@ -120,9 +125,22 @@ namespace Ryujinx.Graphics.OpenGL
             }
         }
 
-        public void SetIndexBuffer(BufferHandle buffer)
+        public void SetIndexBuffer(BufferRange range)
         {
-            GL.BindBuffer(BufferTarget.ElementArrayBuffer, buffer.ToInt32());
+            _indexBuffer = range; // Kept for SetRangeOfIndexBuffer and RestoreIndexBuffer
+            GL.BindBuffer(BufferTarget.ElementArrayBuffer, range.Handle.ToInt32());
+        }
+
+        public void SetRangeOfIndexBuffer()
+        {
+            Buffer.Resize(_tempIndexBuffer, _indexBuffer.Size); // Uses the size of the range stored by SetIndexBuffer
+            Buffer.Copy(_indexBuffer.Handle, _tempIndexBuffer, _indexBuffer.Offset, 0, _indexBuffer.Size); // Range start goes to offset 0
+            GL.BindBuffer(BufferTarget.ElementArrayBuffer, _tempIndexBuffer.ToInt32()); // The temporary buffer is now the bound index buffer
+        }
+
+        public void RestoreIndexBuffer()
+        {
+            GL.BindBuffer(BufferTarget.ElementArrayBuffer, _indexBuffer.Handle.ToInt32()); // Rebinds the buffer last given to SetIndexBuffer
         }
 
         public void Validate()