From 4d69286a9c5af031d4299357e720a967694f265e Mon Sep 17 00:00:00 2001
From: gdkchan <gab.dark.100@gmail.com>
Date: Sun, 11 Sep 2022 12:44:27 -0300
Subject: [PATCH] Implement VRINT (vector) Arm32 NEON instructions (#3691)

---
 ARMeilleure/Decoders/OpCodeTable.cs           |  6 +-
 ARMeilleure/Instructions/InstEmitSimdCvt32.cs | 54 ++++++++++++
 ARMeilleure/Instructions/InstName.cs          |  4 +
 Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs         | 82 ++++++++++++++++++-
 4 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 09df31b0b..c5f86712f 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -791,7 +791,7 @@ namespace ARMeilleure.Decoders
             SetA32("<<<<01101110xxxxxxxxxx000111xxxx", InstName.Uxtb,    InstEmit32.Uxtb,    OpCode32AluUx.Create);
             SetA32("<<<<01101100xxxxxxxxxx000111xxxx", InstName.Uxtb16,  InstEmit32.Uxtb16,  OpCode32AluUx.Create);
             SetA32("<<<<01101111xxxxxxxxxx000111xxxx", InstName.Uxth,    InstEmit32.Uxth,    OpCode32AluUx.Create);
-            
+
             // VFP
             SetVfp("<<<<11101x110000xxxx101x11x0xxxx", InstName.Vabs,   InstEmit32.Vabs_S,   OpCode32SimdS.Create,           OpCode32SimdS.CreateT32);
             SetVfp("<<<<11100x11xxxxxxxx101xx0x0xxxx", InstName.Vadd,   InstEmit32.Vadd_S,   OpCode32SimdRegS.Create,        OpCode32SimdRegS.CreateT32);
@@ -959,6 +959,10 @@ namespace ARMeilleure.Decoders
             SetA32("111100100x00xxxxxxxx1111xxx1xxxx", InstName.Vrecps,      InstEmit32.Vrecps,      OpCode32SimdReg.Create);
             SetA32("111100111x11xx00xxxx000<<xx0xxxx", InstName.Vrev,        InstEmit32.Vrev,        OpCode32SimdRev.Create);
             SetA32("1111001x0x<<xxxxxxxx0001xxx0xxxx", InstName.Vrhadd,      InstEmit32.Vrhadd,      OpCode32SimdReg.Create);
+            SetA32("111100111x111010xxxx01010xx0xxxx", InstName.Vrinta,      InstEmit32.Vrinta_V,    OpCode32SimdCmpZ.Create);
+            SetA32("111100111x111010xxxx01101xx0xxxx", InstName.Vrintm,      InstEmit32.Vrintm_V,    OpCode32SimdCmpZ.Create);
+            SetA32("111100111x111010xxxx01000xx0xxxx", InstName.Vrintn,      InstEmit32.Vrintn_V,    OpCode32SimdCmpZ.Create);
+            SetA32("111100111x111010xxxx01111xx0xxxx", InstName.Vrintp,      InstEmit32.Vrintp_V,    OpCode32SimdCmpZ.Create);
             SetA32("1111001x1x>>>xxxxxxx0010>xx1xxxx", InstName.Vrshr,       InstEmit32.Vrshr,       OpCode32SimdShImm.Create);
             SetA32("111100101x>>>xxxxxxx100001x1xxx0", InstName.Vrshrn,      InstEmit32.Vrshrn,      OpCode32SimdShImmNarrow.Create);
             SetA32("111100111x111011xxxx010x1xx0xxxx", InstName.Vrsqrte,     InstEmit32.Vrsqrte,     OpCode32SimdSqrte.Create);
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
index ec1ead486..b06ddd5e7 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@@ -323,6 +323,60 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        // VRINTA (vector): rounds every element to nearest, with ties away from zero.
+        public static void Vrinta_V(ArmEmitterContext context)
+        {
+            EmitVectorUnaryOpF32(context, value => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, value));
+        }
+
+        // VRINTM (vector): rounds every element towards minus infinity.
+        public static void Vrintm_V(ArmEmitterContext context)
+        {
+            // ROUNDPS was introduced with SSE4.1, so SSE2 support alone does not allow emitting it.
+            if (Optimizations.UseSse41)
+            {
+                EmitVectorUnaryOpSimd32(context, (m) =>
+                    context.AddIntrinsic(Intrinsic.X86Roundps, m, Const(X86GetRoundControl(FPRoundingMode.TowardsMinusInfinity))));
+            }
+            else
+            {
+                // Fallback path built on Math.Floor when the packed rounding instruction is unavailable.
+                EmitVectorUnaryOpF32(context, (m) => EmitUnaryMathCall(context, nameof(Math.Floor), m));
+            }
+        }
+
+        // VRINTN (vector): rounds every element to nearest, with ties to even.
+        public static void Vrintn_V(ArmEmitterContext context)
+        {
+            // ROUNDPS was introduced with SSE4.1, so SSE2 support alone does not allow emitting it.
+            if (Optimizations.UseSse41)
+            {
+                EmitVectorUnaryOpSimd32(context, (m) =>
+                    context.AddIntrinsic(Intrinsic.X86Roundps, m, Const(X86GetRoundControl(FPRoundingMode.ToNearest))));
+            }
+            else
+            {
+                // Fallback path built on the round helper with MidpointRounding.ToEven.
+                EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.ToEven, m));
+            }
+        }
+
+        // VRINTP (vector): rounds every element towards plus infinity.
+        public static void Vrintp_V(ArmEmitterContext context)
+        {
+            // ROUNDPS was introduced with SSE4.1, so SSE2 support alone does not allow emitting it.
+            if (Optimizations.UseSse41)
+            {
+                EmitVectorUnaryOpSimd32(context, (m) =>
+                    context.AddIntrinsic(Intrinsic.X86Roundps, m, Const(X86GetRoundControl(FPRoundingMode.TowardsPlusInfinity))));
+            }
+            else
+            {
+                // Fallback path built on Math.Ceiling when the packed rounding instruction is unavailable.
+                EmitVectorUnaryOpF32(context, (m) => EmitUnaryMathCall(context, nameof(Math.Ceiling), m));
+            }
+        }
+
         // VRINTZ (floating-point).
         public static void Vrint_Z(ArmEmitterContext context)
         {
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index f2c95ae98..f7022f8ef 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -636,6 +636,10 @@ namespace ARMeilleure.Instructions
         Vrev,
         Vrhadd,
         Vrint,
+        Vrinta,
+        Vrintm,
+        Vrintn,
+        Vrintp,
         Vrintx,
         Vrshr,
         Vrshrn,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
index 395f24643..78d5c3cc2 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
@@ -13,6 +13,16 @@ namespace Ryujinx.Tests.Cpu
 #if SimdCvt32
 
 #region "ValueSource (Opcodes)"
+        private static uint[] _Vrint_AMNP_V_F32_()
+        {
+            return new uint[]
+            {
+                0xf3ba0500u, // VRINTA.F32 D0, D0 (bit 6 clear; Vrint_AMNP_V_F32 sets it when q is true)
+                0xf3ba0680u, // VRINTM.F32 D0, D0 (bit 6 clear; Vrint_AMNP_V_F32 sets it when q is true)
+                0xf3ba0400u, // VRINTN.F32 D0, D0 (bit 6 clear; Vrint_AMNP_V_F32 sets it when q is true)
+                0xf3ba0780u  // VRINTP.F32 D0, D0 (bit 6 clear; Vrint_AMNP_V_F32 sets it when q is true)
+            };
+        }
 #endregion
 
 #region "ValueSource (Types)"
@@ -64,6 +74,47 @@ namespace Ryujinx.Tests.Cpu
             }
         }
 
+        // Yields 64-bit test inputs; every fixed pattern repeats one single-precision encoding in both halves.
+        private static IEnumerable<ulong> _2S_F_()
+        {
+            yield return 0xFF7FFFFF_FF7FFFFFul; // -Max Normal    (float.MinValue)
+            yield return 0x80800000_80800000ul; // -Min Normal
+            yield return 0x807FFFFF_807FFFFFul; // -Max Subnormal
+            yield return 0x80000001_80000001ul; // -Min Subnormal (-float.Epsilon)
+            yield return 0x7F7FFFFF_7F7FFFFFul; // +Max Normal    (float.MaxValue)
+            yield return 0x00800000_00800000ul; // +Min Normal
+            yield return 0x007FFFFF_007FFFFFul; // +Max Subnormal
+            yield return 0x00000001_00000001ul; // +Min Subnormal (float.Epsilon)
+
+            if (!NoZeros)
+            {
+                yield return 0x80000000_80000000ul; // -Zero
+                yield return 0x00000000_00000000ul; // +Zero
+            }
+
+            if (!NoInfs)
+            {
+                yield return 0xFF800000_FF800000ul; // -Infinity
+                yield return 0x7F800000_7F800000ul; // +Infinity
+            }
+
+            if (!NoNaNs)
+            {
+                yield return 0xFFC00000_FFC00000ul; // -QNaN (all zeros payload) (float.NaN)
+                yield return 0xFFBFFFFF_FFBFFFFFul; // -SNaN (all ones  payload)
+                yield return 0x7FC00000_7FC00000ul; // +QNaN (all zeros payload) (-float.NaN) (DefaultNaN)
+                yield return 0x7FBFFFFF_7FBFFFFFul; // +SNaN (all ones  payload)
+            }
+
+            for (int round = 0; round < RndCnt; round++)
+            {
+                ulong normal = GenNormalS();
+                ulong subnormal = GenSubnormalS();
+                yield return normal | (normal << 32);       // generated value OR-ed with itself shifted left by 32
+                yield return subnormal | (subnormal << 32); // same combination for the second generated value
+            }
+        }
+
         private static IEnumerable<ulong> _1D_F_()
         {
             yield return 0xFFEFFFFFFFFFFFFFul; // -Max Normal    (double.MinValue)
@@ -224,6 +275,35 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise] [Explicit]
+        public void Vrint_AMNP_V_F32([ValueSource(nameof(_Vrint_AMNP_V_F32_))] uint opcode,
+                                     [Values(0u, 1u, 2u, 3u)] uint rd,
+                                     [Values(0u, 1u, 2u, 3u)] uint rm,
+                                     [ValueSource(nameof(_2S_F_))] ulong d0,
+                                     [ValueSource(nameof(_2S_F_))] ulong d1,
+                                     [ValueSource(nameof(_2S_F_))] ulong d2,
+                                     [ValueSource(nameof(_2S_F_))] ulong d3,
+                                     [Values] bool q)
+        {
+            if (q)
+            {
+                // Set bit 6 of the opcode and round both register indices down to an even number.
+                opcode |= 1u << 6;
+                rd &= ~1u;
+                rm &= ~1u;
+            }
+
+            // rd: low four bits go to bits 12-15, fifth bit to bit 22. rm: low four bits to bits 0-3, fifth bit to bit 5.
+            opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18);
+            opcode |= (rm & 0xf) | ((rm & 0x10) << 1);
+
+            V128 firstVector = MakeVectorE0E1(d0, d1);
+            V128 secondVector = MakeVectorE0E1(d2, d3);
+            SingleOpcode(opcode, v0: firstVector, v1: secondVector);
+
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("VRINTX.F<size> <Sd>, <Sm>")]
         public void Vrintx_S([Values(0u, 1u)] uint rd,
                              [Values(0u, 1u)] uint rm,
@@ -253,7 +333,7 @@ namespace Ryujinx.Tests.Cpu
             }
 
             opcode |= ((size & 3) << 8);
-            
+
             int fpscr = (int)rMode << (int)Fpcr.RMode;
             SingleOpcode(opcode, v0: v0, v1: v1, v2: v2, fpscr: fpscr);