From c7387be0d296f54a6ad5678d25e2c0d4910b7da4 Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Mon, 17 Sep 2018 06:54:05 +0200
Subject: [PATCH] Fix/Add 1+12 [Saturating] [Rounded] Shift Right Narrow (imm.)
 Instructions; add 14 Tests. Add 6 Tests for PR#405. Add 2 Tests for PR#412.
 (#409)

* Update AOpCodeTable.cs

* Update AInstEmitSimdShift.cs

* Update CpuTestSimdShImm.cs

* Update AInstEmitSimdArithmetic.cs

* Update AInstEmitSimdHelper.cs

* Create CpuTestSimdIns.cs

* Update CpuTest.cs

* Update CpuTestSimd.cs

* Update CpuTestSimdReg.cs

* Update CpuTest.cs

* Update CpuTestSimd.cs

* Update CpuTestSimdReg.cs

* Update CpuTestSimd.cs

* Update CpuTestSimdReg.cs

* Update CpuTest.cs

* Update CpuTestSimdReg.cs

* Update CpuTestSimd.cs
---
 ChocolArm64/AOpCodeTable.cs                   |  12 +
 .../Instruction/AInstEmitSimdArithmetic.cs    |  12 +-
 .../Instruction/AInstEmitSimdHelper.cs        |  46 +-
 ChocolArm64/Instruction/AInstEmitSimdShift.cs | 243 +++++++---
 Ryujinx.Tests/Cpu/CpuTest.cs                  | 125 +++++-
 Ryujinx.Tests/Cpu/CpuTestSimd.cs              | 420 ++++++++++--------
 Ryujinx.Tests/Cpu/CpuTestSimdIns.cs           |  74 +++
 Ryujinx.Tests/Cpu/CpuTestSimdReg.cs           | 307 ++++++++++++-
 Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs         | 318 +++++++++++++
 9 files changed, 1254 insertions(+), 303 deletions(-)
 create mode 100644 Ryujinx.Tests/Cpu/CpuTestSimdIns.cs

diff --git a/ChocolArm64/AOpCodeTable.cs b/ChocolArm64/AOpCodeTable.cs
index b053334f3..fe3dce41d 100644
--- a/ChocolArm64/AOpCodeTable.cs
+++ b/ChocolArm64/AOpCodeTable.cs
@@ -364,6 +364,7 @@ namespace ChocolArm64
             SetA64("0x00111000100000000110xxxxxxxxxx", AInstEmit.Rev16_V,       typeof(AOpCodeSimd));
             SetA64("0x1011100x100000000010xxxxxxxxxx", AInstEmit.Rev32_V,       typeof(AOpCodeSimd));
             SetA64("0x001110<<100000000010xxxxxxxxxx", AInstEmit.Rev64_V,       typeof(AOpCodeSimd));
+            SetA64("0x00111100>>>xxx100011xxxxxxxxxx", AInstEmit.Rshrn_V,       typeof(AOpCodeSimdShImm));
             SetA64("0x101110<<1xxxxx011000xxxxxxxxxx", AInstEmit.Rsubhn_V,      typeof(AOpCodeSimdReg));
             SetA64("0x001110<<1xxxxx011111xxxxxxxxxx", AInstEmit.Saba_V,        typeof(AOpCodeSimdReg));
             SetA64("0x001110<<1xxxxx010100xxxxxxxxxx", AInstEmit.Sabal_V,       typeof(AOpCodeSimdReg));
@@ -409,7 +410,14 @@ namespace ChocolArm64
             SetA64("01111110101xxxxx101101xxxxxxxxxx", AInstEmit.Sqrdmulh_S,    typeof(AOpCodeSimdReg));
             SetA64("0x101110011xxxxx101101xxxxxxxxxx", AInstEmit.Sqrdmulh_V,    typeof(AOpCodeSimdReg));
             SetA64("0x101110101xxxxx101101xxxxxxxxxx", AInstEmit.Sqrdmulh_V,    typeof(AOpCodeSimdReg));
+            SetA64("0101111100>>>xxx100111xxxxxxxxxx", AInstEmit.Sqrshrn_S,     typeof(AOpCodeSimdShImm));
             SetA64("0x00111100>>>xxx100111xxxxxxxxxx", AInstEmit.Sqrshrn_V,     typeof(AOpCodeSimdShImm));
+            SetA64("0111111100>>>xxx100011xxxxxxxxxx", AInstEmit.Sqrshrun_S,    typeof(AOpCodeSimdShImm));
+            SetA64("0x10111100>>>xxx100011xxxxxxxxxx", AInstEmit.Sqrshrun_V,    typeof(AOpCodeSimdShImm));
+            SetA64("0101111100>>>xxx100101xxxxxxxxxx", AInstEmit.Sqshrn_S,      typeof(AOpCodeSimdShImm));
+            SetA64("0x00111100>>>xxx100101xxxxxxxxxx", AInstEmit.Sqshrn_V,      typeof(AOpCodeSimdShImm));
+            SetA64("0111111100>>>xxx100001xxxxxxxxxx", AInstEmit.Sqshrun_S,     typeof(AOpCodeSimdShImm));
+            SetA64("0x10111100>>>xxx100001xxxxxxxxxx", AInstEmit.Sqshrun_V,     typeof(AOpCodeSimdShImm));
             SetA64("01011110xx1xxxxx001011xxxxxxxxxx", AInstEmit.Sqsub_S,       typeof(AOpCodeSimdReg));
             SetA64("0>001110<<1xxxxx001011xxxxxxxxxx", AInstEmit.Sqsub_V,       typeof(AOpCodeSimdReg));
             SetA64("01011110<<100001010010xxxxxxxxxx", AInstEmit.Sqxtn_S,       typeof(AOpCodeSimd));
@@ -476,6 +484,10 @@ namespace ChocolArm64
             SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", AInstEmit.Umull_V,       typeof(AOpCodeSimdReg));
             SetA64("01111110xx1xxxxx000011xxxxxxxxxx", AInstEmit.Uqadd_S,       typeof(AOpCodeSimdReg));
             SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", AInstEmit.Uqadd_V,       typeof(AOpCodeSimdReg));
+            SetA64("0111111100>>>xxx100111xxxxxxxxxx", AInstEmit.Uqrshrn_S,     typeof(AOpCodeSimdShImm));
+            SetA64("0x10111100>>>xxx100111xxxxxxxxxx", AInstEmit.Uqrshrn_V,     typeof(AOpCodeSimdShImm));
+            SetA64("0111111100>>>xxx100101xxxxxxxxxx", AInstEmit.Uqshrn_S,      typeof(AOpCodeSimdShImm));
+            SetA64("0x10111100>>>xxx100101xxxxxxxxxx", AInstEmit.Uqshrn_V,      typeof(AOpCodeSimdShImm));
             SetA64("01111110xx1xxxxx001011xxxxxxxxxx", AInstEmit.Uqsub_S,       typeof(AOpCodeSimdReg));
             SetA64("0>101110<<1xxxxx001011xxxxxxxxxx", AInstEmit.Uqsub_V,       typeof(AOpCodeSimdReg));
             SetA64("01111110<<100001010010xxxxxxxxxx", AInstEmit.Uqxtn_S,       typeof(AOpCodeSimd));
diff --git a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
index b9aedd07b..27a86d84c 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
@@ -1199,22 +1199,22 @@ namespace ChocolArm64.Instruction
 
         public static void Sqxtn_S(AILEmitterCtx Context)
         {
-            EmitScalarSaturatingNarrowOpSxSx(Context, () => { });
+            EmitSaturatingNarrowOp(Context, SaturatingNarrowFlags.ScalarSxSx);
         }
 
         public static void Sqxtn_V(AILEmitterCtx Context)
         {
-            EmitVectorSaturatingNarrowOpSxSx(Context, () => { });
+            EmitSaturatingNarrowOp(Context, SaturatingNarrowFlags.VectorSxSx);
         }
 
         public static void Sqxtun_S(AILEmitterCtx Context)
         {
-            EmitScalarSaturatingNarrowOpSxZx(Context, () => { });
+            EmitSaturatingNarrowOp(Context, SaturatingNarrowFlags.ScalarSxZx);
         }
 
         public static void Sqxtun_V(AILEmitterCtx Context)
         {
-            EmitVectorSaturatingNarrowOpSxZx(Context, () => { });
+            EmitSaturatingNarrowOp(Context, SaturatingNarrowFlags.VectorSxZx);
         }
 
         public static void Srhadd_V(AILEmitterCtx Context)
@@ -1455,12 +1455,12 @@ namespace ChocolArm64.Instruction
 
         public static void Uqxtn_S(AILEmitterCtx Context)
         {
-            EmitScalarSaturatingNarrowOpZxZx(Context, () => { });
+            EmitSaturatingNarrowOp(Context, SaturatingNarrowFlags.ScalarZxZx);
         }
 
         public static void Uqxtn_V(AILEmitterCtx Context)
         {
-            EmitVectorSaturatingNarrowOpZxZx(Context, () => { });
+            EmitSaturatingNarrowOp(Context, SaturatingNarrowFlags.VectorZxZx);
         }
 
         public static void Urhadd_V(AILEmitterCtx Context)
diff --git a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
index cb884c1ac..75a5a0d09 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
@@ -1004,56 +1004,14 @@ namespace ChocolArm64.Instruction
 
             ScalarSxSx = Scalar | SignedSrc | SignedDst,
             ScalarSxZx = Scalar | SignedSrc,
-            ScalarZxSx = Scalar | SignedDst,
             ScalarZxZx = Scalar,
 
             VectorSxSx = SignedSrc | SignedDst,
             VectorSxZx = SignedSrc,
-            VectorZxSx = SignedDst,
             VectorZxZx = 0
         }
 
-        public static void EmitScalarSaturatingNarrowOpSxSx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.ScalarSxSx);
-        }
-
-        public static void EmitScalarSaturatingNarrowOpSxZx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.ScalarSxZx);
-        }
-
-        public static void EmitScalarSaturatingNarrowOpZxSx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.ScalarZxSx);
-        }
-
-        public static void EmitScalarSaturatingNarrowOpZxZx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.ScalarZxZx);
-        }
-
-        public static void EmitVectorSaturatingNarrowOpSxSx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.VectorSxSx);
-        }
-
-        public static void EmitVectorSaturatingNarrowOpSxZx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.VectorSxZx);
-        }
-
-        public static void EmitVectorSaturatingNarrowOpZxSx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.VectorZxSx);
-        }
-
-        public static void EmitVectorSaturatingNarrowOpZxZx(AILEmitterCtx Context, Action Emit)
-        {
-            EmitSaturatingNarrowOp(Context, Emit, SaturatingNarrowFlags.VectorZxZx);
-        }
-
-        public static void EmitSaturatingNarrowOp(AILEmitterCtx Context, Action Emit, SaturatingNarrowFlags Flags)
+        public static void EmitSaturatingNarrowOp(AILEmitterCtx Context, SaturatingNarrowFlags Flags)
         {
             AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
 
@@ -1080,8 +1038,6 @@ namespace ChocolArm64.Instruction
             {
                 EmitVectorExtract(Context, Op.Rn, Index, Op.Size + 1, SignedSrc);
 
-                Emit();
-
                 EmitSatQ(Context, Op.Size, SignedSrc, SignedDst);
 
                 EmitVectorInsertTmp(Context, Part + Index, Op.Size);
diff --git a/ChocolArm64/Instruction/AInstEmitSimdShift.cs b/ChocolArm64/Instruction/AInstEmitSimdShift.cs
index 4dee53b9b..127abf1df 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdShift.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdShift.cs
@@ -10,6 +10,11 @@ namespace ChocolArm64.Instruction
 {
     static partial class AInstEmit
     {
+        public static void Rshrn_V(AILEmitterCtx Context)
+        {
+            EmitVectorShrImmNarrowOpZx(Context, Round: true); // Same emitter as Shrn_V; Round: true adds the rounding constant before the shift.
+        }
+
         public static void Shl_S(AILEmitterCtx Context)
         {
             AOpCodeSimdShImm Op = (AOpCodeSimdShImm)Context.CurrOp;
@@ -45,9 +50,7 @@ namespace ChocolArm64.Instruction
 
         public static void Shrn_V(AILEmitterCtx Context)
         {
-            AOpCodeSimdShImm Op = (AOpCodeSimdShImm)Context.CurrOp;
-
-            EmitVectorShImmNarrowBinaryZx(Context, () => Context.Emit(OpCodes.Shr_Un), GetImmShr(Op));
+            EmitVectorShrImmNarrowOpZx(Context, Round: false);
         }
 
         public static void Sli_V(AILEmitterCtx Context)
@@ -85,26 +88,44 @@ namespace ChocolArm64.Instruction
             }
         }
 
+        public static void Sqrshrn_S(AILEmitterCtx Context)
+        {
+            EmitRoundShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.ScalarSxSx); // Scalar form, signed source and signed destination; the wrapper adds the Round flag.
+        }
+
         public static void Sqrshrn_V(AILEmitterCtx Context)
         {
-            AOpCodeSimdShImm Op = (AOpCodeSimdShImm)Context.CurrOp;
+            EmitRoundShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.VectorSxSx);
+        }
 
-            int Shift = GetImmShr(Op);
+        public static void Sqrshrun_S(AILEmitterCtx Context)
+        {
+            EmitRoundShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.ScalarSxZx);
+        }
 
-            long RoundConst = 1L << (Shift - 1);
+        public static void Sqrshrun_V(AILEmitterCtx Context)
+        {
+            EmitRoundShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.VectorSxZx);
+        }
 
-            Action Emit = () =>
-            {
-                Context.EmitLdc_I8(RoundConst);
+        public static void Sqshrn_S(AILEmitterCtx Context)
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
+        }
 
-                Context.Emit(OpCodes.Add);
+        public static void Sqshrn_V(AILEmitterCtx Context)
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.VectorSxSx);
+        }
 
-                Context.EmitLdc_I4(Shift);
+        public static void Sqshrun_S(AILEmitterCtx Context)
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.ScalarSxZx);
+        }
 
-                Context.Emit(OpCodes.Shr);
-            };
-
-            EmitVectorSaturatingNarrowOpSxSx(Context, Emit);
+        public static void Sqshrun_V(AILEmitterCtx Context)
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.VectorSxZx);
         }
 
         public static void Srshr_S(AILEmitterCtx Context)
@@ -159,6 +180,26 @@ namespace ChocolArm64.Instruction
             EmitVectorShrImmOpSx(Context, ShrImmFlags.Accumulate);
         }
 
+        public static void Uqrshrn_S(AILEmitterCtx Context)
+        {
+            EmitRoundShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.ScalarZxZx); // Scalar form, unsigned source and destination; the wrapper adds the Round flag.
+        }
+
+        public static void Uqrshrn_V(AILEmitterCtx Context)
+        {
+            EmitRoundShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.VectorZxZx); // Vector form, unsigned source and destination; the wrapper adds the Round flag.
+        }
+
+        public static void Uqshrn_S(AILEmitterCtx Context)
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.ScalarZxZx); // Scalar form, unsigned source and destination, no rounding.
+        }
+
+        public static void Uqshrn_V(AILEmitterCtx Context)
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.VectorZxZx); // Vector form, unsigned source and destination, no rounding.
+        }
+
         public static void Urshr_S(AILEmitterCtx Context)
         {
             EmitScalarShrImmOpZx(Context, ShrImmFlags.Round);
@@ -367,6 +408,138 @@ namespace ChocolArm64.Instruction
             }
         }
 
+        private static void EmitVectorShrImmNarrowOpZx(AILEmitterCtx Context, bool Round) // Shared emitter for Shrn_V (Round: false) and Rshrn_V (Round: true).
+        {
+            AOpCodeSimdShImm Op = (AOpCodeSimdShImm)Context.CurrOp;
+
+            int Shift = GetImmShr(Op);
+
+            long RoundConst = 1L << (Shift - 1); // Value of the highest bit shifted out; only emitted when Round is set.
+
+            int Elems = 8 >> Op.Size; // Number of elements processed: 8, 4, 2 for Op.Size 0, 1, 2.
+
+            int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0; // 128-bit ops insert results starting at slot Elems instead of slot 0.
+
+            if (Part != 0) // Results go to the higher slots, so start the temp vector from Rd's current value.
+            {
+                Context.EmitLdvec(Op.Rd);
+                Context.EmitStvectmp();
+            }
+
+            for (int Index = 0; Index < Elems; Index++)
+            {
+                EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1); // Source elements are read one size step wider than the destination.
+
+                if (Round)
+                {
+                    Context.EmitLdc_I8(RoundConst);
+
+                    Context.Emit(OpCodes.Add);
+                }
+
+                Context.EmitLdc_I4(Shift);
+
+                Context.Emit(OpCodes.Shr_Un); // Logical (unsigned) shift right.
+
+                EmitVectorInsertTmp(Context, Part + Index, Op.Size); // Inserted at the narrower destination size.
+            }
+
+            Context.EmitLdvectmp(); // Publish the temp vector as the new Rd.
+            Context.EmitStvec(Op.Rd);
+
+            if (Part == 0) // Not a 128-bit op: Rd additionally goes through EmitVectorZeroUpper.
+            {
+                EmitVectorZeroUpper(Context, Op.Rd);
+            }
+        }
+
+        [Flags]
+        private enum ShrImmSaturatingNarrowFlags // Options read by EmitShrImmSaturatingNarrowOp.
+        {
+            Scalar    = 1 << 0, // Process a single element instead of 8 >> Op.Size elements.
+            SignedSrc = 1 << 1, // Extract source elements as signed and shift with Shr instead of Shr_Un.
+            SignedDst = 1 << 2, // Passed through to EmitSatQ as its SignedDst argument.
+
+            Round = 1 << 3, // Add the rounding constant before shifting; set by EmitRoundShrImmSaturatingNarrowOp.
+
+            ScalarSxSx = Scalar | SignedSrc | SignedDst, // Presets: first Sx/Zx = signed/unsigned source, second = signed/unsigned destination.
+            ScalarSxZx = Scalar | SignedSrc,
+            ScalarZxZx = Scalar,
+
+            VectorSxSx = SignedSrc | SignedDst,
+            VectorSxZx = SignedSrc,
+            VectorZxZx = 0
+        }
+
+        private static void EmitRoundShrImmSaturatingNarrowOp(AILEmitterCtx Context, ShrImmSaturatingNarrowFlags Flags) // Rounding variant of EmitShrImmSaturatingNarrowOp.
+        {
+            EmitShrImmSaturatingNarrowOp(Context, ShrImmSaturatingNarrowFlags.Round | Flags); // Only difference from the plain call: the Round flag is OR-ed in.
+        }
+
+        private static void EmitShrImmSaturatingNarrowOp(AILEmitterCtx Context, ShrImmSaturatingNarrowFlags Flags) // Per element: extract wide -> (optional round) shift right -> EmitSatQ -> insert narrow.
+        {
+            AOpCodeSimdShImm Op = (AOpCodeSimdShImm)Context.CurrOp;
+
+            bool Scalar    = (Flags & ShrImmSaturatingNarrowFlags.Scalar)    != 0;
+            bool SignedSrc = (Flags & ShrImmSaturatingNarrowFlags.SignedSrc) != 0;
+            bool SignedDst = (Flags & ShrImmSaturatingNarrowFlags.SignedDst) != 0;
+            bool Round     = (Flags & ShrImmSaturatingNarrowFlags.Round)     != 0;
+
+            int Shift = GetImmShr(Op);
+
+            long RoundConst = 1L << (Shift - 1); // Value of the highest bit shifted out; only used when Round is set.
+
+            int Elems = !Scalar ? 8 >> Op.Size : 1; // One element for scalar forms, otherwise 8, 4, 2 for Op.Size 0, 1, 2.
+
+            int Part = !Scalar && (Op.RegisterSize == ARegisterSize.SIMD128) ? Elems : 0; // Only 128-bit vector forms insert starting at slot Elems.
+
+            if (Scalar)
+            {
+                EmitVectorZeroLowerTmp(Context); // NOTE(review): presumably clears the low part of the temp vector before the single insert - confirm in the helper.
+            }
+
+            if (Part != 0) // Results go to the higher slots, so start the temp vector from Rd's current value.
+            {
+                Context.EmitLdvec(Op.Rd);
+                Context.EmitStvectmp();
+            }
+
+            for (int Index = 0; Index < Elems; Index++)
+            {
+                EmitVectorExtract(Context, Op.Rn, Index, Op.Size + 1, SignedSrc); // Source elements are read one size step wider than the destination.
+
+                if (Op.Size <= 1 || !Round) // Inline IL path: small sources, or any size when no rounding add is needed.
+                {
+                    if (Round)
+                    {
+                        Context.EmitLdc_I8(RoundConst);
+
+                        Context.Emit(OpCodes.Add);
+                    }
+
+                    Context.EmitLdc_I4(Shift);
+
+                    Context.Emit(SignedSrc ? OpCodes.Shr : OpCodes.Shr_Un); // Arithmetic shift for signed sources, logical shift otherwise.
+                }
+                else /* if (Op.Size == 2 && Round) */
+                {
+                    EmitShrImm_64(Context, SignedSrc, RoundConst, Shift); // Shift <= 32. NOTE(review): presumably routed to the helper because the inline 64-bit add could overflow - confirm.
+                }
+
+                EmitSatQ(Context, Op.Size, SignedSrc, SignedDst); // Runs for both shift paths, before the narrow insert.
+
+                EmitVectorInsertTmp(Context, Part + Index, Op.Size); // Inserted at the narrower destination size.
+            }
+
+            Context.EmitLdvectmp(); // Publish the temp vector as the new Rd.
+            Context.EmitStvec(Op.Rd);
+
+            if (Part == 0) // Scalar and non-128-bit vector forms: Rd additionally goes through EmitVectorZeroUpper.
+            {
+                EmitVectorZeroUpper(Context, Op.Rd);
+            }
+        }
+
         // Dst_64 = (Int(Src_64, Signed) + RoundConst) >> Shift;
         private static void EmitShrImm_64(
             AILEmitterCtx Context,
@@ -374,11 +547,6 @@ namespace ChocolArm64.Instruction
             long RoundConst,
             int  Shift)
         {
-            if (((AOpCodeSimd)Context.CurrOp).Size < 3)
-            {
-                throw new InvalidOperationException();
-            }
-
             Context.EmitLdc_I8(RoundConst);
             Context.EmitLdc_I4(Shift);
 
@@ -387,41 +555,6 @@ namespace ChocolArm64.Instruction
                 : nameof(ASoftFallback.UnsignedShrImm_64));
         }
 
-        private static void EmitVectorShImmNarrowBinarySx(AILEmitterCtx Context, Action Emit, int Imm)
-        {
-            EmitVectorShImmNarrowBinaryOp(Context, Emit, Imm, true);
-        }
-
-        private static void EmitVectorShImmNarrowBinaryZx(AILEmitterCtx Context, Action Emit, int Imm)
-        {
-            EmitVectorShImmNarrowBinaryOp(Context, Emit, Imm, false);
-        }
-
-        private static void EmitVectorShImmNarrowBinaryOp(AILEmitterCtx Context, Action Emit, int Imm, bool Signed)
-        {
-            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
-
-            int Elems = 8 >> Op.Size;
-
-            int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0;
-
-            for (int Index = 0; Index < Elems; Index++)
-            {
-                EmitVectorExtract(Context, Op.Rn, Index, Op.Size + 1, Signed);
-
-                Context.EmitLdc_I4(Imm);
-
-                Emit();
-
-                EmitVectorInsert(Context, Op.Rd, Part + Index, Op.Size);
-            }
-
-            if (Part == 0)
-            {
-                EmitVectorZeroUpper(Context, Op.Rd);
-            }
-        }
-
         private static void EmitVectorShImmWidenBinarySx(AILEmitterCtx Context, Action Emit, int Imm)
         {
             EmitVectorShImmWidenBinaryOp(Context, Emit, Imm, true);
diff --git a/Ryujinx.Tests/Cpu/CpuTest.cs b/Ryujinx.Tests/Cpu/CpuTest.cs
index 1f7151ffe..c69c7a02f 100644
--- a/Ryujinx.Tests/Cpu/CpuTest.cs
+++ b/Ryujinx.Tests/Cpu/CpuTest.cs
@@ -93,6 +93,7 @@ namespace Ryujinx.Tests.Cpu
                                       Vector128<float> V0 = default(Vector128<float>),
                                       Vector128<float> V1 = default(Vector128<float>),
                                       Vector128<float> V2 = default(Vector128<float>),
+                                      Vector128<float> V3 = default(Vector128<float>),
                                       bool Overflow = false, bool Carry = false, bool Zero = false, bool Negative = false,
                                       int Fpcr = 0x0, int Fpsr = 0x0)
         {
@@ -106,6 +107,7 @@ namespace Ryujinx.Tests.Cpu
             Thread.ThreadState.V0 = V0;
             Thread.ThreadState.V1 = V1;
             Thread.ThreadState.V2 = V2;
+            Thread.ThreadState.V3 = V3;
 
             Thread.ThreadState.Overflow = Overflow;
             Thread.ThreadState.Carry    = Carry;
@@ -127,6 +129,7 @@ namespace Ryujinx.Tests.Cpu
                 UnicornEmu.Q[0] = V0;
                 UnicornEmu.Q[1] = V1;
                 UnicornEmu.Q[2] = V2;
+                UnicornEmu.Q[3] = V3;
 
                 UnicornEmu.OverflowFlag = Overflow;
                 UnicornEmu.CarryFlag    = Carry;
@@ -162,13 +165,14 @@ namespace Ryujinx.Tests.Cpu
                                             Vector128<float> V0 = default(Vector128<float>),
                                             Vector128<float> V1 = default(Vector128<float>),
                                             Vector128<float> V2 = default(Vector128<float>),
+                                            Vector128<float> V3 = default(Vector128<float>),
                                             bool Overflow = false, bool Carry = false, bool Zero = false, bool Negative = false,
                                             int Fpcr = 0x0, int Fpsr = 0x0)
         {
             this.Opcode(Opcode);
             this.Opcode(0xD4200000); // BRK #0
             this.Opcode(0xD65F03C0); // RET
-            SetThreadState(X0, X1, X2, X3, X31, V0, V1, V2, Overflow, Carry, Zero, Negative, Fpcr, Fpsr);
+            SetThreadState(X0, X1, X2, X3, X31, V0, V1, V2, V3, Overflow, Carry, Zero, Negative, Fpcr, Fpsr);
             ExecuteOpcodes();
 
             return GetThreadState();
@@ -195,13 +199,30 @@ namespace Ryujinx.Tests.Cpu
             QC  = 1 << 27
         }
 
-        protected void CompareAgainstUnicorn(FPSR FpsrMask = FPSR.None)
+        protected enum FpSkips { None, IfNaN_S, IfNaN_D }; // CompareAgainstUnicorn ignores the test when lane 0 of Unicorn's Q[0] is NaN, read as float (_S) or double (_D).
+
+        protected enum FpUseTolerance { None, OneUlps_S, OneUlps_D }; // Lets CompareAgainstUnicorn fall back to a 1-ULP comparison of V0 as float (_S) or double (_D) lanes when exact equality fails.
+
+        protected void CompareAgainstUnicorn(
+            FPSR           FpsrMask       = FPSR.None,
+            FpSkips        FpSkips        = FpSkips.None,
+            FpUseTolerance FpUseTolerance = FpUseTolerance.None)
         {
             if (!UnicornAvailable)
             {
                 return;
             }
 
+            if (FpSkips == FpSkips.IfNaN_S && float.IsNaN(VectorExtractSingle(UnicornEmu.Q[0], (byte)0)))
+            {
+                Assert.Ignore("NaN test.");
+            }
+
+            if (FpSkips == FpSkips.IfNaN_D && double.IsNaN(VectorExtractDouble(UnicornEmu.Q[0], (byte)0)))
+            {
+                Assert.Ignore("NaN test.");
+            }
+
             Assert.That(Thread.ThreadState.X0,  Is.EqualTo(UnicornEmu.X[0]));
             Assert.That(Thread.ThreadState.X1,  Is.EqualTo(UnicornEmu.X[1]));
             Assert.That(Thread.ThreadState.X2,  Is.EqualTo(UnicornEmu.X[2]));
@@ -236,7 +257,51 @@ namespace Ryujinx.Tests.Cpu
 
             Assert.That(Thread.ThreadState.X31, Is.EqualTo(UnicornEmu.SP));
 
-            Assert.That(Thread.ThreadState.V0,  Is.EqualTo(UnicornEmu.Q[0]));
+            if (FpUseTolerance == FpUseTolerance.None)
+            {
+                Assert.That(Thread.ThreadState.V0, Is.EqualTo(UnicornEmu.Q[0]));
+            }
+            else
+            {
+                if (!Is.EqualTo(UnicornEmu.Q[0]).ApplyTo(Thread.ThreadState.V0).IsSuccess)
+                {
+                    if (FpUseTolerance == FpUseTolerance.OneUlps_S)
+                    {
+                        if (float.IsNormal   (VectorExtractSingle(UnicornEmu.Q[0], (byte)0)) ||
+                            float.IsSubnormal(VectorExtractSingle(UnicornEmu.Q[0], (byte)0)))
+                        {
+                            Assert.That   (VectorExtractSingle(Thread.ThreadState.V0, (byte)0),
+                                Is.EqualTo(VectorExtractSingle(UnicornEmu.Q[0],       (byte)0)).Within(1).Ulps);
+                            Assert.That   (VectorExtractSingle(Thread.ThreadState.V0, (byte)1),
+                                Is.EqualTo(VectorExtractSingle(UnicornEmu.Q[0],       (byte)1)).Within(1).Ulps);
+                            Assert.That   (VectorExtractSingle(Thread.ThreadState.V0, (byte)2),
+                                Is.EqualTo(VectorExtractSingle(UnicornEmu.Q[0],       (byte)2)).Within(1).Ulps);
+                            Assert.That   (VectorExtractSingle(Thread.ThreadState.V0, (byte)3),
+                                Is.EqualTo(VectorExtractSingle(UnicornEmu.Q[0],       (byte)3)).Within(1).Ulps);
+                        }
+                        else
+                        {
+                            Assert.That(Thread.ThreadState.V0, Is.EqualTo(UnicornEmu.Q[0]));
+                        }
+                    }
+
+                    if (FpUseTolerance == FpUseTolerance.OneUlps_D)
+                    {
+                        if (double.IsNormal   (VectorExtractDouble(UnicornEmu.Q[0], (byte)0)) ||
+                            double.IsSubnormal(VectorExtractDouble(UnicornEmu.Q[0], (byte)0)))
+                        {
+                            Assert.That   (VectorExtractDouble(Thread.ThreadState.V0, (byte)0),
+                                Is.EqualTo(VectorExtractDouble(UnicornEmu.Q[0],       (byte)0)).Within(1).Ulps);
+                            Assert.That   (VectorExtractDouble(Thread.ThreadState.V0, (byte)1),
+                                Is.EqualTo(VectorExtractDouble(UnicornEmu.Q[0],       (byte)1)).Within(1).Ulps);
+                        }
+                        else
+                        {
+                            Assert.That(Thread.ThreadState.V0, Is.EqualTo(UnicornEmu.Q[0]));
+                        }
+                    }
+                }
+            }
             Assert.That(Thread.ThreadState.V1,  Is.EqualTo(UnicornEmu.Q[1]));
             Assert.That(Thread.ThreadState.V2,  Is.EqualTo(UnicornEmu.Q[2]));
             Assert.That(Thread.ThreadState.V3,  Is.EqualTo(UnicornEmu.Q[3]));
@@ -310,6 +375,18 @@ namespace Ryujinx.Tests.Cpu
             return Sse.StaticCast<long, float>(Sse2.SetVector128(BitConverter.DoubleToInt64Bits(E1), 0));
         }
 
+        protected static float VectorExtractSingle(Vector128<float> Vector, byte Index) // Returns 32-bit lane Index of Vector as a float.
+        {
+            if (!Sse41.IsSupported) // Sse41.Extract below needs SSE4.1.
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            int Value = Sse41.Extract(Sse.StaticCast<float, int>(Vector), Index); // Read the lane as raw int bits.
+
+            return BitConverter.Int32BitsToSingle(Value); // Bit-for-bit reinterpretation, not a numeric conversion.
+        }
+
         protected static double VectorExtractDouble(Vector128<float> Vector, byte Index)
         {
             if (!Sse41.IsSupported)
@@ -371,5 +448,47 @@ namespace Ryujinx.Tests.Cpu
 
             return Sse41.Extract(Sse.StaticCast<float, ulong>(Vector), (byte)1);
         }
+
+        protected static uint GenNormal_S() // Random 32-bit pattern whose exponent field (mask 0x7F800000) is neither all zeros nor all ones.
+        {
+            uint Rnd;
+
+            do      Rnd = TestContext.CurrentContext.Random.NextUInt();
+            while ((Rnd & 0x7F800000u) == 0u || // Redraw: exponent field all zeros.
+                   (Rnd & 0x7F800000u) == 0x7F800000u); // Redraw: exponent field all ones.
+
+            return Rnd;
+        }
+
+        protected static uint GenSubNormal_S() // Random 32-bit pattern with a zero exponent field and a non-zero fraction; the sign bit stays random.
+        {
+            uint Rnd;
+
+            do      Rnd = TestContext.CurrentContext.Random.NextUInt();
+            while ((Rnd & 0x007FFFFFu) == 0u); // Redraw while the low 23 bits are all zero.
+
+            return Rnd & 0x807FFFFFu; // Keep sign and fraction bits, clear the exponent field.
+        }
+
+        protected static ulong GenNormal_D() // Random 64-bit pattern whose exponent field (mask 0x7FF0000000000000) is neither all zeros nor all ones.
+        {
+            ulong Rnd;
+
+            do      Rnd = TestContext.CurrentContext.Random.NextULong();
+            while ((Rnd & 0x7FF0000000000000ul) == 0ul || // Redraw: exponent field all zeros.
+                   (Rnd & 0x7FF0000000000000ul) == 0x7FF0000000000000ul); // Redraw: exponent field all ones.
+
+            return Rnd;
+        }
+
+        protected static ulong GenSubNormal_D() // Random 64-bit pattern with a zero exponent field and a non-zero fraction; the sign bit stays random.
+        {
+            ulong Rnd;
+
+            do      Rnd = TestContext.CurrentContext.Random.NextULong();
+            while ((Rnd & 0x000FFFFFFFFFFFFFul) == 0ul); // Redraw while the low 52 bits are all zero.
+
+            return Rnd & 0x800FFFFFFFFFFFFFul; // Keep sign and fraction bits, clear the exponent field.
+        }
     }
 }
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd.cs b/Ryujinx.Tests/Cpu/CpuTestSimd.cs
index ec0cd104f..b423b4de7 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs
@@ -4,6 +4,7 @@ using ChocolArm64.State;
 
 using NUnit.Framework;
 
+using System.Collections.Generic;
 using System.Runtime.Intrinsics;
 
 namespace Ryujinx.Tests.Cpu
@@ -13,7 +14,7 @@ namespace Ryujinx.Tests.Cpu
     {
 #if Simd
 
-#region "ValueSource"
+#region "ValueSource (Types)"
         private static ulong[] _1B1H1S1D_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x000000000000007Ful,
@@ -78,78 +79,183 @@ namespace Ryujinx.Tests.Cpu
                                  0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
         }
 
-        private static ulong[] _1S_F_()
+        private static IEnumerable<ulong> _1S_F_()
         {
-            return new ulong[]
+            yield return 0x00000000FF7FFFFFul; // -Max Normal (float.MinValue)
+            yield return 0x0000000080800000ul; // -Min Normal
+            yield return 0x00000000807FFFFFul; // -Max SubNormal
+            yield return 0x0000000080000001ul; // -Min SubNormal
+            yield return 0x000000007F7FFFFFul; // +Max Normal (float.MaxValue)
+            yield return 0x0000000000800000ul; // +Min Normal
+            yield return 0x00000000007FFFFFul; // +Max SubNormal
+            yield return 0x0000000000000001ul; // +Min SubNormal
+
+            if (!NoZeros)
             {
-                0x00000000FFFFFFFFul, // -QNaN (all ones payload)
-                0x00000000FFBFFFFFul, // -SNaN (all ones payload)
-                0x00000000FF800000ul, // -INF
-                0x00000000FF7FFFFFul, // -Max Normal, float.MinValue
-                0x0000000080800000ul, // -Min Normal
-                0x00000000807FFFFFul, // -Max SubNormal
-                0x0000000080000001ul, // -Min SubNormal
-                0x0000000080000000ul, // -0
-                0x0000000000000000ul, // +0
-                0x0000000000000001ul, // +Min SubNormal
-                0x00000000007FFFFFul, // +Max SubNormal
-                0x0000000000800000ul, // +Min Normal
-                0x000000007F7FFFFFul, // +Max Normal, float.MaxValue
-                0x000000007F800000ul, // +INF
-                0x000000007FBFFFFFul, // +SNaN (all ones payload)
-                0x000000007FFFFFFFul  // +QNaN (all ones payload)
+                yield return 0x0000000080000000ul; // -Zero
+                yield return 0x0000000000000000ul; // +Zero
+            }
+
+            if (!NoInfs)
+            {
+                yield return 0x00000000FF800000ul; // -Infinity
+                yield return 0x000000007F800000ul; // +Infinity
+            }
+
+            if (!NoNaNs)
+            {
+                yield return 0x00000000FFFFFFFFul; // -QNaN (all ones payload)
+                yield return 0x00000000FFBFFFFFul; // -SNaN (all ones payload)
+                yield return 0x000000007FFFFFFFul; // +QNaN (all ones payload)
+                yield return 0x000000007FBFFFFFul; // +SNaN (all ones payload)
+            }
+
+            for (int Cnt = 1; Cnt <= RndCnt; Cnt++)
+            {
+                ulong Grbg = TestContext.CurrentContext.Random.NextUInt();
+                ulong Rnd1 = GenNormal_S();
+                ulong Rnd2 = GenSubNormal_S();
+
+                yield return (Grbg << 32) | Rnd1;
+                yield return (Grbg << 32) | Rnd2;
+            }
+        }
+
+        // Yields test inputs in which both 32-bit halves of the ulong hold the same single-precision bit pattern.
+        private static IEnumerable<ulong> _2S_F_()
+        {
+            uint[] Finite =
+            {
+                0xFF7FFFFFu, 0x80800000u, // -Max Normal (float.MinValue), -Min Normal
+                0x807FFFFFu, 0x80000001u, // -Max SubNormal, -Min SubNormal
+                0x7F7FFFFFu, 0x00800000u, // +Max Normal (float.MaxValue), +Min Normal
+                0x007FFFFFu, 0x00000001u  // +Max SubNormal, +Min SubNormal
+            };
+            uint[] Zeros = { 0x80000000u, 0x00000000u }; // -Zero, +Zero
+            uint[] Infs  = { 0xFF800000u, 0x7F800000u }; // -Infinity, +Infinity
+            uint[] NaNs  =
+            {
+                0xFFFFFFFFu, 0xFFBFFFFFu, // -QNaN, -SNaN (all ones payload)
+                0x7FFFFFFFu, 0x7FBFFFFFu  // +QNaN, +SNaN (all ones payload)
+            };
+
+            // The finite edge cases are always produced; zeros, infinities and NaNs can each be switched off.
+            foreach (ulong Lane in Finite) yield return (Lane << 32) | Lane;
+
+            if (!NoZeros)
+                foreach (ulong Lane in Zeros) yield return (Lane << 32) | Lane;
+
+            if (!NoInfs)
+                foreach (ulong Lane in Infs) yield return (Lane << 32) | Lane;
+
+            if (!NoNaNs)
+                foreach (ulong Lane in NaNs) yield return (Lane << 32) | Lane;
+
+            // RndCnt rounds, each adding one random normal and one random subnormal pattern.
+            for (int Round = 0; Round < RndCnt; Round++)
+            {
+                ulong Normal    = GenNormal_S();
+                ulong SubNormal = GenSubNormal_S();
+
+                yield return (Normal    << 32) | Normal;
+                yield return (SubNormal << 32) | SubNormal;
+            }
+        }
+
+        // Yields double-precision bit patterns: fixed edge cases first, then RndCnt rounds of random ones.
+        private static IEnumerable<ulong> _1D_F_()
+        {
+            ulong[] Finite =
+            {
+                0xFFEFFFFFFFFFFFFFul, 0x8010000000000000ul, // -Max Normal (double.MinValue), -Min Normal
+                0x800FFFFFFFFFFFFFul, 0x8000000000000001ul, // -Max SubNormal, -Min SubNormal
+                0x7FEFFFFFFFFFFFFFul, 0x0010000000000000ul, // +Max Normal (double.MaxValue), +Min Normal
+                0x000FFFFFFFFFFFFFul, 0x0000000000000001ul  // +Max SubNormal, +Min SubNormal
+            };
+            ulong[] Zeros = { 0x8000000000000000ul, 0x0000000000000000ul }; // -Zero, +Zero
+            ulong[] Infs  = { 0xFFF0000000000000ul, 0x7FF0000000000000ul }; // -Infinity, +Infinity
+            ulong[] NaNs  =
+            {
+                0xFFFFFFFFFFFFFFFFul, 0xFFF7FFFFFFFFFFFFul, // -QNaN, -SNaN (all ones payload)
+                0x7FFFFFFFFFFFFFFFul, 0x7FF7FFFFFFFFFFFFul  // +QNaN, +SNaN (all ones payload)
+            };
+
+            // The finite edge cases are always produced; zeros, infinities and NaNs can each be switched off.
+            foreach (ulong Value in Finite) yield return Value;
+
+            if (!NoZeros)
+                foreach (ulong Value in Zeros) yield return Value;
+
+            if (!NoInfs)
+                foreach (ulong Value in Infs) yield return Value;
+
+            if (!NoNaNs)
+                foreach (ulong Value in NaNs) yield return Value;
+
+            // RndCnt rounds, each adding one random normal and one random subnormal pattern.
+            for (int Round = 0; Round < RndCnt; Round++)
+            {
+                ulong Normal    = GenNormal_D();
+                ulong SubNormal = GenSubNormal_D();
+
+                yield return Normal;
+                yield return SubNormal;
+            }
+        }
+#endregion
+
+#region "ValueSource (Opcodes)"
+        private static uint[] _F_Cvt_NZ_SU_S_S_()
+        {
+            return new uint[]
+            {
+                0x5E21A820u, // FCVTNS S0, S1
+                0x7E21A820u, // FCVTNU S0, S1
+                0x5EA1B820u, // FCVTZS S0, S1
+                0x7EA1B820u  // FCVTZU S0, S1
             };
         }
 
-        private static ulong[] _2S_F_()
+        private static uint[] _F_Cvt_NZ_SU_S_D_()
         {
-            return new ulong[]
+            return new uint[]
             {
-                0xFFFFFFFFFFFFFFFFul, // -QNaN (all ones payload)
-                0xFFBFFFFFFFBFFFFFul, // -SNaN (all ones payload)
-                0xFF800000FF800000ul, // -INF
-                0xFF7FFFFFFF7FFFFFul, // -Max Normal, float.MinValue
-                0x8080000080800000ul, // -Min Normal
-                0x807FFFFF807FFFFFul, // -Max SubNormal
-                0x8000000180000001ul, // -Min SubNormal
-                0x8000000080000000ul, // -0
-                0x0000000000000000ul, // +0
-                0x0000000100000001ul, // +Min SubNormal
-                0x007FFFFF007FFFFFul, // +Max SubNormal
-                0x0080000000800000ul, // +Min Normal
-                0x7F7FFFFF7F7FFFFFul, // +Max Normal, float.MaxValue
-                0x7F8000007F800000ul, // +INF
-                0x7FBFFFFF7FBFFFFFul, // +SNaN (all ones payload)
-                0x7FFFFFFF7FFFFFFFul  // +QNaN (all ones payload)
+                0x5E61A820u, // FCVTNS D0, D1
+                0x7E61A820u, // FCVTNU D0, D1
+                0x5EE1B820u, // FCVTZS D0, D1
+                0x7EE1B820u  // FCVTZU D0, D1
             };
         }
 
-        private static ulong[] _1D_F_()
+        private static uint[] _F_Cvt_NZ_SU_V_2S_4S_()
         {
-            return new ulong[]
+            return new uint[]
             {
-                0xFFFFFFFFFFFFFFFFul, // -QNaN (all ones payload)
-                0xFFF7FFFFFFFFFFFFul, // -SNaN (all ones payload)
-                0xFFF0000000000000ul, // -INF
-                0xFFEFFFFFFFFFFFFFul, // -Max Normal, double.MinValue
-                0x8010000000000000ul, // -Min Normal
-                0x800FFFFFFFFFFFFFul, // -Max SubNormal
-                0x8000000000000001ul, // -Min SubNormal
-                0x8000000000000000ul, // -0
-                0x0000000000000000ul, // +0
-                0x0000000000000001ul, // +Min SubNormal
-                0x000FFFFFFFFFFFFFul, // +Max SubNormal
-                0x0010000000000000ul, // +Min Normal
-                0x7FEFFFFFFFFFFFFFul, // +Max Normal, double.MaxValue
-                0x7FF0000000000000ul, // +INF
-                0x7FF7FFFFFFFFFFFFul, // +SNaN (all ones payload)
-                0x7FFFFFFFFFFFFFFFul  // +QNaN (all ones payload)
+                0x0E21A800u, // FCVTNS V0.2S, V0.2S
+                0x2E21A800u, // FCVTNU V0.2S, V0.2S
+                0x0EA1B800u, // FCVTZS V0.2S, V0.2S
+                0x2EA1B800u  // FCVTZU V0.2S, V0.2S
+            };
+        }
+
+        private static uint[] _F_Cvt_NZ_SU_V_2D_()
+        {
+            return new uint[]
+            {
+                0x4E61A800u, // FCVTNS V0.2D, V0.2D
+                0x6E61A800u, // FCVTNU V0.2D, V0.2D
+                0x4EE1B800u, // FCVTZS V0.2D, V0.2D
+                0x6EE1B800u  // FCVTZU V0.2D, V0.2D
             };
         }
 #endregion
 
         private const int RndCnt = 2;
 
+        private static readonly bool NoZeros = false;
+        private static readonly bool NoInfs  = false;
+        private static readonly bool NoNaNs  = false;
+
         [Test, Pairwise, Description("ABS <V><d>, <V><n>")]
         public void Abs_S_D([Values(0u)]     uint Rd,
                             [Values(1u, 0u)] uint Rn,
@@ -645,176 +751,104 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise, Description("FCVTNS <V><d>, <V><n>")]
-        public void Fcvtns_S_S([Values(0u)]     uint Rd,
-                               [Values(1u, 0u)] uint Rn,
-                               [ValueSource("_1S_F_")] [Random(RndCnt)] ulong Z,
-                               [ValueSource("_1S_F_")] [Random(RndCnt)] ulong A)
+        [Test, Pairwise, Description("FCVT <Dd>, <Sn>")]
+        public void Fcvt_S_SD([ValueSource("_1S_F_")] ulong A)
         {
+            //const int DNFlagBit = 25; // Default NaN mode control bit.
             //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
 
-            uint Opcode = 0x5E21A800; // FCVTNS S0, S0
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
+            uint Opcode = 0x1E22C020; // FCVT D0, S1
 
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
+            ulong Z = TestContext.CurrentContext.Random.NextULong();
+            Vector128<float> V0 = MakeVectorE1(Z);
+            Vector128<float> V1 = MakeVectorE0(A);
 
+            //int Fpcr  = 1 << DNFlagBit; // Any operation involving one or more NaNs returns the Default NaN.
+                //Fpcr |= 1 << FZFlagBit; // Flush-to-zero mode enabled.
+
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
+
+            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IOC*/);
+        }
+
+        [Test, Pairwise, Description("FCVT <Sd>, <Dn>")]
+        public void Fcvt_S_DS([ValueSource("_1D_F_")] ulong A)
+        {
+            uint Opcode = 0x1E624020; // FCVT S0, D1
+
+            ulong Z = TestContext.CurrentContext.Random.NextULong();
             Vector128<float> V0 = MakeVectorE0E1(Z, Z);
             Vector128<float> V1 = MakeVectorE0(A);
 
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void F_Cvt_NZ_SU_S_S([ValueSource("_F_Cvt_NZ_SU_S_S_")] uint Opcodes,
+                                    [ValueSource("_1S_F_")] ulong A)
+        {
+            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
+
+            ulong Z = TestContext.CurrentContext.Random.NextULong();
+            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> V1 = MakeVectorE0(A);
+
+            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
+
+            AThreadState ThreadState = SingleOpcode(Opcodes, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
 
             CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
         }
 
-        [Test, Pairwise, Description("FCVTNS <V><d>, <V><n>")]
-        public void Fcvtns_S_D([Values(0u)]     uint Rd,
-                               [Values(1u, 0u)] uint Rn,
-                               [ValueSource("_1D_F_")] [Random(RndCnt)] ulong Z,
-                               [ValueSource("_1D_F_")] [Random(RndCnt)] ulong A)
+        [Test, Pairwise]
+        public void F_Cvt_NZ_SU_S_D([ValueSource("_F_Cvt_NZ_SU_S_D_")] uint Opcodes,
+                                    [ValueSource("_1D_F_")] ulong A)
         {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x5E61A800; // FCVTNS D0, D0
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
-
-            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
+            ulong Z = TestContext.CurrentContext.Random.NextULong();
+            Vector128<float> V0 = MakeVectorE1(Z);
             Vector128<float> V1 = MakeVectorE0(A);
 
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
+            AThreadState ThreadState = SingleOpcode(Opcodes, V0: V0, V1: V1);
 
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
+            CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise, Description("FCVTNS <Vd>.<T>, <Vn>.<T>")]
-        public void Fcvtns_V_2S_4S([Values(0u)]     uint Rd,
-                                   [Values(1u, 0u)] uint Rn,
-                                   [ValueSource("_2S_F_")] [Random(RndCnt)] ulong Z,
-                                   [ValueSource("_2S_F_")] [Random(RndCnt)] ulong A,
-                                   [Values(0b0u, 0b1u)] uint Q) // <2S, 4S>
+        [Test, Pairwise]
+        public void F_Cvt_NZ_SU_V_2S_4S([ValueSource("_F_Cvt_NZ_SU_V_2S_4S_")] uint Opcodes,
+                                        [Values(0u)]     uint Rd,
+                                        [Values(1u, 0u)] uint Rn,
+                                        [ValueSource("_2S_F_")] ulong Z,
+                                        [ValueSource("_2S_F_")] ulong A,
+                                        [Values(0b0u, 0b1u)] uint Q) // <2S, 4S>
         {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x0E21A800; // FCVTNS V0.2S, V0.2S
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-            Opcode |= ((Q & 1) << 30);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
+            Opcodes |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
+            Opcodes |= ((Q & 1) << 30);
 
             Vector128<float> V0 = MakeVectorE0E1(Z, Z);
             Vector128<float> V1 = MakeVectorE0E1(A, A * Q);
 
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
+            AThreadState ThreadState = SingleOpcode(Opcodes, V0: V0, V1: V1);
 
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
+            CompareAgainstUnicorn();
         }
 
-        [Test, Pairwise, Description("FCVTNS <Vd>.<T>, <Vn>.<T>")]
-        public void Fcvtns_V_2D([Values(0u)]     uint Rd,
-                                [Values(1u, 0u)] uint Rn,
-                                [ValueSource("_1D_F_")] [Random(RndCnt)] ulong Z,
-                                [ValueSource("_1D_F_")] [Random(RndCnt)] ulong A)
+        [Test, Pairwise]
+        public void F_Cvt_NZ_SU_V_2D([ValueSource("_F_Cvt_NZ_SU_V_2D_")] uint Opcodes,
+                                     [Values(0u)]     uint Rd,
+                                     [Values(1u, 0u)] uint Rn,
+                                     [ValueSource("_1D_F_")] ulong Z,
+                                     [ValueSource("_1D_F_")] ulong A)
         {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x4E61A800; // FCVTNS V0.2D, V0.2D
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
+            Opcodes |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
 
             Vector128<float> V0 = MakeVectorE0E1(Z, Z);
             Vector128<float> V1 = MakeVectorE0E1(A, A);
 
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
+            AThreadState ThreadState = SingleOpcode(Opcodes, V0: V0, V1: V1);
 
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
-        }
-
-        [Test, Pairwise, Description("FCVTNU <V><d>, <V><n>")]
-        public void Fcvtnu_S_S([Values(0u)]     uint Rd,
-                               [Values(1u, 0u)] uint Rn,
-                               [ValueSource("_1S_F_")] [Random(RndCnt)] ulong Z,
-                               [ValueSource("_1S_F_")] [Random(RndCnt)] ulong A)
-        {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x7E21A800; // FCVTNU S0, S0
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
-
-            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
-            Vector128<float> V1 = MakeVectorE0(A);
-
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
-
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
-        }
-
-        [Test, Pairwise, Description("FCVTNU <V><d>, <V><n>")]
-        public void Fcvtnu_S_D([Values(0u)]     uint Rd,
-                               [Values(1u, 0u)] uint Rn,
-                               [ValueSource("_1D_F_")] [Random(RndCnt)] ulong Z,
-                               [ValueSource("_1D_F_")] [Random(RndCnt)] ulong A)
-        {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x7E61A800; // FCVTNU D0, D0
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
-
-            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
-            Vector128<float> V1 = MakeVectorE0(A);
-
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
-
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
-        }
-
-        [Test, Pairwise, Description("FCVTNU <Vd>.<T>, <Vn>.<T>")]
-        public void Fcvtnu_V_2S_4S([Values(0u)]     uint Rd,
-                                   [Values(1u, 0u)] uint Rn,
-                                   [ValueSource("_2S_F_")] [Random(RndCnt)] ulong Z,
-                                   [ValueSource("_2S_F_")] [Random(RndCnt)] ulong A,
-                                   [Values(0b0u, 0b1u)] uint Q) // <2S, 4S>
-        {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x2E21A800; // FCVTNU V0.2S, V0.2S
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-            Opcode |= ((Q & 1) << 30);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
-
-            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
-            Vector128<float> V1 = MakeVectorE0E1(A, A * Q);
-
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
-
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
-        }
-
-        [Test, Pairwise, Description("FCVTNU <Vd>.<T>, <Vn>.<T>")]
-        public void Fcvtnu_V_2D([Values(0u)]     uint Rd,
-                                [Values(1u, 0u)] uint Rn,
-                                [ValueSource("_1D_F_")] [Random(RndCnt)] ulong Z,
-                                [ValueSource("_1D_F_")] [Random(RndCnt)] ulong A)
-        {
-            //const int FZFlagBit = 24; // Flush-to-zero mode control bit.
-
-            uint Opcode = 0x6E61A800; // FCVTNU V0.2D, V0.2D
-            Opcode |= ((Rn & 31) << 5) | ((Rd & 31) << 0);
-
-            //int Fpcr = 1 << FZFlagBit; // Flush-to-zero mode enabled.
-
-            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
-            Vector128<float> V1 = MakeVectorE0E1(A, A);
-
-            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1/*, Fpcr: Fpcr*/);
-
-            CompareAgainstUnicorn(/*FpsrMask: FPSR.IDC | FPSR.IXC | FPSR.IOC*/);
+            CompareAgainstUnicorn();
         }
 
         [Test, Pairwise, Description("NEG <V><d>, <V><n>")]
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs b/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
new file mode 100644
index 000000000..387cdf5dd
--- /dev/null
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdIns.cs
@@ -0,0 +1,74 @@
+#define SimdIns
+
+using ChocolArm64.State;
+
+using NUnit.Framework;
+
+using System.Runtime.Intrinsics;
+
+namespace Ryujinx.Tests.Cpu
+{
+    [Category("SimdIns")] // Tested: second half of 2018.
+    public sealed class CpuTestSimdIns : CpuTest
+    {
+#if SimdIns
+
+#region "ValueSource"
+        // 32-bit values supplied to the Wn parameter of Dup_Gp_W.
+        private static uint[] _W_() => new uint[]
+        {
+            0x00000000u, 0x0000007Fu, 0x00000080u, 0x000000FFu,
+            0x00007FFFu, 0x00008000u, 0x0000FFFFu, 0x7FFFFFFFu,
+            0x80000000u, 0xFFFFFFFFu
+        };
+
+        // 64-bit values supplied to the Xn parameter of Dup_Gp_X.
+        private static ulong[] _X_() => new ulong[]
+        {
+            0x0000000000000000ul, 0x7FFFFFFFFFFFFFFFul,
+            0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul
+        };
+#endregion
+
+        // Count handed to every [Random(...)] parameter attribute in this class.
+        private const int RndCnt = 2;
+
+        [Test, Pairwise, Description("DUP <Vd>.<T>, <R><n>")]
+        public void Dup_Gp_W([Values(0u)]      uint Rd,
+                             [Values(1u, 31u)] uint Rn,
+                             [ValueSource("_W_")] [Random(RndCnt)] uint Wn,
+                             [Values(0, 1, 2)] int Size,  // Q0: <8B,  4H, 2S>
+                             [Values(0b0u, 0b1u)] uint Q) // Q1: <16B, 8H, 4S>
+        {
+            // imm5 carries a single set bit at position Size (bit 0, 1 or 2).
+            uint Imm5 = (1u << Size) & 0x1Fu;
+
+            // The base value is marked RESERVED on its own; imm5, Q and the register fields are OR-ed into it.
+            uint Dup = 0x0E000C00u | ((Q & 1u) << 30) | (Imm5 << 16) | ((Rn & 31u) << 5) | (Rd & 31u);
+
+            // V0 is pre-filled from one random value through MakeVectorE0E1.
+            ulong Garbage = TestContext.CurrentContext.Random.NextULong();
+
+            SingleOpcode(Dup, X1: Wn, V0: MakeVectorE0E1(Garbage, Garbage));
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise, Description("DUP <Vd>.<T>, <R><n>")]
+        public void Dup_Gp_X([Values(0u)]      uint Rd,
+                             [Values(1u, 31u)] uint Rn,
+                             [ValueSource("_X_")] [Random(RndCnt)] ulong Xn)
+        {
+            // Base encoding is DUP V0.2D, X0; the Rn and Rd fields are OR-ed into it.
+            uint Dup = 0x4E080C00u | ((Rn & 31u) << 5) | (Rd & 31u);
+
+            // V0 is pre-filled from one random value through MakeVectorE0E1.
+            ulong Garbage = TestContext.CurrentContext.Random.NextULong();
+
+            SingleOpcode(Dup, X1: Xn, V0: MakeVectorE0E1(Garbage, Garbage));
+
+            CompareAgainstUnicorn();
+        }
+#endif
+    }
+}
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
index b7150db30..ae409a6d8 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
@@ -4,6 +4,7 @@ using ChocolArm64.State;
 
 using NUnit.Framework;
 
+using System.Collections.Generic;
 using System.Runtime.Intrinsics;
 
 namespace Ryujinx.Tests.Cpu
@@ -13,7 +14,7 @@ namespace Ryujinx.Tests.Cpu
     {
 #if SimdReg
 
-#region "ValueSource"
+#region "ValueSource (Types)"
         private static ulong[] _1B1H1S1D_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x000000000000007Ful,
@@ -76,10 +77,188 @@ namespace Ryujinx.Tests.Cpu
                                  0x8000000080000000ul, 0x7FFFFFFFFFFFFFFFul,
                                  0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
         }
+
+        // Yields single-precision bit patterns in the low 32 bits of a ulong; the fixed cases leave the high half zero.
+        private static IEnumerable<ulong> _1S_F_()
+        {
+            ulong[] Finite =
+            {
+                0xFF7FFFFFul, 0x80800000ul, // -Max Normal (float.MinValue), -Min Normal
+                0x807FFFFFul, 0x80000001ul, // -Max SubNormal, -Min SubNormal
+                0x7F7FFFFFul, 0x00800000ul, // +Max Normal (float.MaxValue), +Min Normal
+                0x007FFFFFul, 0x00000001ul  // +Max SubNormal, +Min SubNormal
+            };
+            ulong[] Zeros = { 0x80000000ul, 0x00000000ul }; // -Zero, +Zero
+            ulong[] Infs  = { 0xFF800000ul, 0x7F800000ul }; // -Infinity, +Infinity
+            ulong[] NaNs  =
+            {
+                0xFFFFFFFFul, 0xFFBFFFFFul, // -QNaN, -SNaN (all ones payload)
+                0x7FFFFFFFul, 0x7FBFFFFFul  // +QNaN, +SNaN (all ones payload)
+            };
+
+            // The finite edge cases are always produced; zeros, infinities and NaNs can each be switched off.
+            foreach (ulong Value in Finite) yield return Value;
+
+            if (!NoZeros)
+                foreach (ulong Value in Zeros) yield return Value;
+
+            if (!NoInfs)
+                foreach (ulong Value in Infs) yield return Value;
+
+            if (!NoNaNs)
+                foreach (ulong Value in NaNs) yield return Value;
+
+            // RndCnt rounds: a random normal and a random subnormal, both under the same random high half.
+            for (int Round = 0; Round < RndCnt; Round++)
+            {
+                ulong High      = (ulong)TestContext.CurrentContext.Random.NextUInt() << 32;
+                ulong Normal    = GenNormal_S();
+                ulong SubNormal = GenSubNormal_S();
+
+                yield return High | Normal;
+                yield return High | SubNormal;
+            }
+        }
+
+        // Yields test inputs in which both 32-bit halves of the ulong hold the same single-precision bit pattern.
+        private static IEnumerable<ulong> _2S_F_()
+        {
+            uint[] Finite =
+            {
+                0xFF7FFFFFu, 0x80800000u, // -Max Normal (float.MinValue), -Min Normal
+                0x807FFFFFu, 0x80000001u, // -Max SubNormal, -Min SubNormal
+                0x7F7FFFFFu, 0x00800000u, // +Max Normal (float.MaxValue), +Min Normal
+                0x007FFFFFu, 0x00000001u  // +Max SubNormal, +Min SubNormal
+            };
+            uint[] Zeros = { 0x80000000u, 0x00000000u }; // -Zero, +Zero
+            uint[] Infs  = { 0xFF800000u, 0x7F800000u }; // -Infinity, +Infinity
+            uint[] NaNs  =
+            {
+                0xFFFFFFFFu, 0xFFBFFFFFu, // -QNaN, -SNaN (all ones payload)
+                0x7FFFFFFFu, 0x7FBFFFFFu  // +QNaN, +SNaN (all ones payload)
+            };
+
+            // The finite edge cases are always produced; zeros, infinities and NaNs can each be switched off.
+            foreach (ulong Lane in Finite) yield return (Lane << 32) | Lane;
+
+            if (!NoZeros)
+                foreach (ulong Lane in Zeros) yield return (Lane << 32) | Lane;
+
+            if (!NoInfs)
+                foreach (ulong Lane in Infs) yield return (Lane << 32) | Lane;
+
+            if (!NoNaNs)
+                foreach (ulong Lane in NaNs) yield return (Lane << 32) | Lane;
+
+            // RndCnt rounds, each adding one random normal and one random subnormal pattern.
+            for (int Round = 0; Round < RndCnt; Round++)
+            {
+                ulong Normal    = GenNormal_S();
+                ulong SubNormal = GenSubNormal_S();
+
+                yield return (Normal    << 32) | Normal;
+                yield return (SubNormal << 32) | SubNormal;
+            }
+        }
+
+        // Yields double-precision bit patterns: fixed edge cases first, then RndCnt rounds of random ones.
+        private static IEnumerable<ulong> _1D_F_()
+        {
+            ulong[] Finite =
+            {
+                0xFFEFFFFFFFFFFFFFul, 0x8010000000000000ul, // -Max Normal (double.MinValue), -Min Normal
+                0x800FFFFFFFFFFFFFul, 0x8000000000000001ul, // -Max SubNormal, -Min SubNormal
+                0x7FEFFFFFFFFFFFFFul, 0x0010000000000000ul, // +Max Normal (double.MaxValue), +Min Normal
+                0x000FFFFFFFFFFFFFul, 0x0000000000000001ul  // +Max SubNormal, +Min SubNormal
+            };
+            ulong[] Zeros = { 0x8000000000000000ul, 0x0000000000000000ul }; // -Zero, +Zero
+            ulong[] Infs  = { 0xFFF0000000000000ul, 0x7FF0000000000000ul }; // -Infinity, +Infinity
+            ulong[] NaNs  =
+            {
+                0xFFFFFFFFFFFFFFFFul, 0xFFF7FFFFFFFFFFFFul, // -QNaN, -SNaN (all ones payload)
+                0x7FFFFFFFFFFFFFFFul, 0x7FF7FFFFFFFFFFFFul  // +QNaN, +SNaN (all ones payload)
+            };
+
+            // The finite edge cases are always produced; zeros, infinities and NaNs can each be switched off.
+            foreach (ulong Value in Finite) yield return Value;
+
+            if (!NoZeros)
+                foreach (ulong Value in Zeros) yield return Value;
+
+            if (!NoInfs)
+                foreach (ulong Value in Infs) yield return Value;
+
+            if (!NoNaNs)
+                foreach (ulong Value in NaNs) yield return Value;
+
+            // RndCnt rounds, each adding one random normal and one random subnormal pattern.
+            for (int Round = 0; Round < RndCnt; Round++)
+            {
+                ulong Normal    = GenNormal_D();
+                ulong SubNormal = GenSubNormal_D();
+
+                yield return Normal;
+                yield return SubNormal;
+            }
+        }
+#endregion
+
+#region "ValueSource (Opcodes)"
+        // Opcodes of the single-precision scalar FMAX/FMAXNM/FMIN/FMINNM instructions.
+        // The register numbers (0, 1, 2) are already part of each value,
+        // as the mnemonic beside it shows.
+        private static uint[] _F_Max_Min_Nm_S_S_() => new uint[]
+        {
+            0x1E224820u, // FMAX   S0, S1, S2
+            0x1E226820u, // FMAXNM S0, S1, S2
+            0x1E225820u, // FMIN   S0, S1, S2
+            0x1E227820u  // FMINNM S0, S1, S2
+        };
+
+        // Opcodes of the double-precision scalar FMAX/FMAXNM/FMIN/FMINNM instructions.
+        // The register numbers (0, 1, 2) are already part of each value,
+        // as the mnemonic beside it shows.
+        private static uint[] _F_Max_Min_Nm_S_D_() => new uint[]
+        {
+            0x1E624820u, // FMAX   D0, D1, D2
+            0x1E626820u, // FMAXNM D0, D1, D2
+            0x1E625820u, // FMIN   D0, D1, D2
+            0x1E627820u  // FMINNM D0, D1, D2
+        };
+
+        // Opcodes of the vector FMAX/FMAXNM/FMAXP/FMIN/FMINNM/FMINP instructions in their .2S form.
+        // Every register field is zero (V0) in these values; presumably the consuming test ORs in
+        // its own register numbers and the Q bit - confirm against that test.
+        private static uint[] _F_Max_Min_Nm_P_V_2S_4S_() => new uint[]
+        {
+            0x0E20F400u, // FMAX   V0.2S, V0.2S, V0.2S
+            0x0E20C400u, // FMAXNM V0.2S, V0.2S, V0.2S
+            0x2E20F400u, // FMAXP  V0.2S, V0.2S, V0.2S
+            0x0EA0F400u, // FMIN   V0.2S, V0.2S, V0.2S
+            0x0EA0C400u, // FMINNM V0.2S, V0.2S, V0.2S
+            0x2EA0F400u  // FMINP  V0.2S, V0.2S, V0.2S
+        };
+
+        // Opcodes of the vector FMAX/FMAXNM/FMAXP/FMIN/FMINNM/FMINP instructions in their .2D form.
+        // Every register field is zero (V0) in these values; presumably the consuming test ORs in
+        // its own register numbers - confirm against that test.
+        private static uint[] _F_Max_Min_Nm_P_V_2D_() => new uint[]
+        {
+            0x4E60F400u, // FMAX   V0.2D, V0.2D, V0.2D
+            0x4E60C400u, // FMAXNM V0.2D, V0.2D, V0.2D
+            0x6E60F400u, // FMAXP  V0.2D, V0.2D, V0.2D
+            0x4EE0F400u, // FMIN   V0.2D, V0.2D, V0.2D
+            0x4EE0C400u, // FMINNM V0.2D, V0.2D, V0.2D
+            0x6EE0F400u  // FMINP  V0.2D, V0.2D, V0.2D
+        };
 #endregion
 
         private const int RndCnt = 2;
 
+        private static readonly bool NoZeros = false;
+        private static readonly bool NoInfs  = false;
+        private static readonly bool NoNaNs  = false;
+
         [Test, Pairwise, Description("ADD <V><d>, <V><n>, <V><m>")]
         public void Add_S_D([Values(0u)]     uint Rd,
                             [Values(1u, 0u)] uint Rn,
@@ -856,6 +1035,132 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise, Description("FMADD <Sd>, <Sn>, <Sm>, <Sa>")]
+        public void Fmadd_S_S([ValueSource("_1S_F_")] ulong A,
+                              [ValueSource("_1S_F_")] ulong B,
+                              [ValueSource("_1S_F_")] ulong C)
+        {
+            uint Encoding = 0x1F020C20; // FMADD S0, S1, S2, S3
+
+            // V0 (S0, the destination) is seeded from a random 64-bit value; V1..V3 carry A, B and C.
+            ulong Seed = TestContext.CurrentContext.Random.NextULong();
+
+            Vector128<float> Dest    = MakeVectorE0E1(Seed, Seed);
+            Vector128<float> Factor1 = MakeVectorE0(A);
+            Vector128<float> Factor2 = MakeVectorE0(B);
+            Vector128<float> Addend  = MakeVectorE0(C);
+
+            // Disabled knobs, kept for reference:
+            //   Fpcr: 1 << 25 (DN, any operation involving one or more NaNs returns the Default NaN),
+            //         1 << 24 (FZ, flush-to-zero mode enabled);
+            //   FpsrMask: FPSR.IDC | FPSR.IOC; FpUseTolerance: FpUseTolerance.OneUlps_S.
+            SingleOpcode(Encoding, V0: Dest, V1: Factor1, V2: Factor2, V3: Addend);
+
+            CompareAgainstUnicorn(FpSkips: FpSkips.IfNaN_S);
+        }
+
+        [Test, Pairwise, Description("FMADD <Dd>, <Dn>, <Dm>, <Da>")]
+        public void Fmadd_S_D([ValueSource("_1D_F_")] ulong A,
+                              [ValueSource("_1D_F_")] ulong B,
+                              [ValueSource("_1D_F_")] ulong C)
+        {
+            uint Encoding = 0x1F420C20; // FMADD D0, D1, D2, D3
+
+            // V0 (D0, the destination) is seeded through MakeVectorE1 with a random value.
+            Vector128<float> Dest    = MakeVectorE1(TestContext.CurrentContext.Random.NextULong());
+            Vector128<float> Factor1 = MakeVectorE0(A);
+            Vector128<float> Factor2 = MakeVectorE0(B);
+            Vector128<float> Addend  = MakeVectorE0(C);
+
+            SingleOpcode(Encoding, V0: Dest, V1: Factor1, V2: Factor2, V3: Addend);
+
+            CompareAgainstUnicorn(FpSkips: FpSkips.IfNaN_D); // FpUseTolerance.OneUlps_D stays disabled.
+        }
+
+        [Test, Pairwise]
+        public void F_Max_Min_Nm_S_S([ValueSource("_F_Max_Min_Nm_S_S_")] uint Opcodes,
+                                     [ValueSource("_1S_F_")] ulong A,
+                                     [ValueSource("_1S_F_")] ulong B)
+        {
+            ulong Seed = TestContext.CurrentContext.Random.NextULong();
+
+            // V0 is seeded from a random value; V1 and V2 carry the A and B inputs.
+            Vector128<float> Dest = MakeVectorE0E1(Seed, Seed);
+            Vector128<float> Lhs  = MakeVectorE0(A);
+            Vector128<float> Rhs  = MakeVectorE0(B);
+
+            // Disabled knobs, kept for reference:
+            //   Fpcr: 1 << 25 (DN, any operation involving one or more NaNs returns the Default NaN),
+            //         1 << 24 (FZ, flush-to-zero mode enabled);
+            //   FpsrMask: FPSR.IDC | FPSR.IOC.
+            SingleOpcode(Opcodes, V0: Dest, V1: Lhs, V2: Rhs);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void F_Max_Min_Nm_S_D([ValueSource("_F_Max_Min_Nm_S_D_")] uint Opcodes,
+                                     [ValueSource("_1D_F_")] ulong A,
+                                     [ValueSource("_1D_F_")] ulong B)
+        {
+            // V0 (D0) is the destination; V1 (D1) and V2 (D2) carry the two operands.
+            Vector128<float> Dest = MakeVectorE1(TestContext.CurrentContext.Random.NextULong());
+            Vector128<float> Lhs  = MakeVectorE0(A);
+            Vector128<float> Rhs  = MakeVectorE0(B);
+
+            SingleOpcode(Opcodes, V0: Dest, V1: Lhs, V2: Rhs);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void F_Max_Min_Nm_P_V_2S_4S([ValueSource("_F_Max_Min_Nm_P_V_2S_4S_")] uint Opcodes,
+                                           [Values(0u)]     uint Rd,
+                                           [Values(1u, 0u)] uint Rn,
+                                           [Values(2u, 0u)] uint Rm,
+                                           [ValueSource("_2S_F_")] ulong Z,
+                                           [ValueSource("_2S_F_")] ulong A,
+                                           [ValueSource("_2S_F_")] ulong B,
+                                           [Values(0b0u, 0b1u)] uint Q) // <2S, 4S>
+        {
+            // Patch the register numbers and the Q bit (bit 30) into the base encoding.
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5) | ((Rm & 31) << 16);
+            Opcodes |= (Q & 1) << 30;
+
+            // With Q == 0 the second argument of both source vectors becomes A * 0 / B * 0, i.e. zero.
+            ulong UpperA = A * Q;
+            ulong UpperB = B * Q;
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0E1(A, UpperA);
+            Vector128<float> Reg2 = MakeVectorE0E1(B, UpperB);
+
+            // Disabled knobs, kept for reference: Fpcr = 1 << 25 (DN) | 1 << 24 (FZ); FpsrMask: FPSR.IDC | FPSR.IOC.
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1, V2: Reg2);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void F_Max_Min_Nm_P_V_2D([ValueSource("_F_Max_Min_Nm_P_V_2D_")] uint Opcodes,
+                                        [Values(0u)]     uint Rd,
+                                        [Values(1u, 0u)] uint Rn,
+                                        [Values(2u, 0u)] uint Rm,
+                                        [ValueSource("_1D_F_")] ulong Z,
+                                        [ValueSource("_1D_F_")] ulong A,
+                                        [ValueSource("_1D_F_")] ulong B)
+        {
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5) | ((Rm & 31) << 16); // register fields
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0E1(A, A);
+            Vector128<float> Reg2 = MakeVectorE0E1(B, B);
+
+            // Run the single instruction, then hand over to the Unicorn comparison.
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1, V2: Reg2);
+            CompareAgainstUnicorn();
+        }
+
         [Test, Pairwise, Description("ORN <Vd>.<T>, <Vn>.<T>, <Vm>.<T>")]
         public void Orn_V_8B([Values(0u)]     uint Rd,
                              [Values(1u, 0u)] uint Rn,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
index 772852226..edc50d4d0 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
@@ -20,6 +20,18 @@ namespace Ryujinx.Tests.Cpu
                                  0x8000000000000000ul, 0xFFFFFFFFFFFFFFFFul };
         }
 
+        private static ulong[] _1H_() // Boundary values confined to the low 16 bits: 0, 0x7FFF, 0x8000, 0xFFFF.
+        {
+            return new ulong[] { 0x0000000000000000ul, 0x0000000000007FFFul,
+                                 0x0000000000008000ul, 0x000000000000FFFFul };
+        }
+
+        private static ulong[] _1S_() // Boundary values confined to the low 32 bits: 0, 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF.
+        {
+            return new ulong[] { 0x0000000000000000ul, 0x000000007FFFFFFFul,
+                                 0x0000000080000000ul, 0x00000000FFFFFFFFul };
+        }
+
         private static ulong[] _2S_()
         {
             return new ulong[] { 0x0000000000000000ul, 0x7FFFFFFF7FFFFFFFul,
@@ -114,6 +126,111 @@ namespace Ryujinx.Tests.Cpu
                 0x6F401400u  // USRA  V0.2D, V0.2D, #64
             };
         }
+
+        private static uint[] _ShrImmNarrow_V_8H8B_8H16B_() // Base encodings; ShrImmNarrow_V_8H8B_8H16B ORs in Rd/Rn, the shift field and Q.
+        {
+            return new uint[]
+            {
+                0x0F088C00u, // RSHRN V0.8B, V0.8H, #8
+                0x0F088400u  // SHRN  V0.8B, V0.8H, #8
+            };
+        }
+
+        private static uint[] _ShrImmNarrow_V_4S4H_4S8H_() // Base encodings; ShrImmNarrow_V_4S4H_4S8H ORs in Rd/Rn, the shift field and Q.
+        {
+            return new uint[]
+            {
+                0x0F108C00u, // RSHRN V0.4H, V0.4S, #16
+                0x0F108400u  // SHRN  V0.4H, V0.4S, #16
+            };
+        }
+
+        private static uint[] _ShrImmNarrow_V_2D2S_2D4S_() // Base encodings; ShrImmNarrow_V_2D2S_2D4S ORs in Rd/Rn, the shift field and Q.
+        {
+            return new uint[]
+            {
+                0x0F208C00u, // RSHRN V0.2S, V0.2D, #32
+                0x0F208400u  // SHRN  V0.2S, V0.2D, #32
+            };
+        }
+
+        private static uint[] _ShrImmSaturatingNarrow_S_HB_() // Base scalar encodings; ShrImmSaturatingNarrow_S_HB ORs in Rd/Rn and the shift field.
+        {
+            return new uint[]
+            {
+                0x5F089C00u, // SQRSHRN  B0, H0, #8
+                0x7F089C00u, // UQRSHRN  B0, H0, #8
+                0x7F088C00u, // SQRSHRUN B0, H0, #8
+                0x5F089400u, // SQSHRN   B0, H0, #8
+                0x7F089400u, // UQSHRN   B0, H0, #8
+                0x7F088400u  // SQSHRUN  B0, H0, #8
+            };
+        }
+
+        private static uint[] _ShrImmSaturatingNarrow_S_SH_() // Base scalar encodings; ShrImmSaturatingNarrow_S_SH ORs in Rd/Rn and the shift field.
+        {
+            return new uint[]
+            {
+                0x5F109C00u, // SQRSHRN  H0, S0, #16
+                0x7F109C00u, // UQRSHRN  H0, S0, #16
+                0x7F108C00u, // SQRSHRUN H0, S0, #16
+                0x5F109400u, // SQSHRN   H0, S0, #16
+                0x7F109400u, // UQSHRN   H0, S0, #16
+                0x7F108400u  // SQSHRUN  H0, S0, #16
+            };
+        }
+
+        private static uint[] _ShrImmSaturatingNarrow_S_DS_() // Base scalar encodings; ShrImmSaturatingNarrow_S_DS ORs in Rd/Rn and the shift field.
+        {
+            return new uint[]
+            {
+                0x5F209C00u, // SQRSHRN  S0, D0, #32
+                0x7F209C00u, // UQRSHRN  S0, D0, #32
+                0x7F208C00u, // SQRSHRUN S0, D0, #32
+                0x5F209400u, // SQSHRN   S0, D0, #32
+                0x7F209400u, // UQSHRN   S0, D0, #32
+                0x7F208400u  // SQSHRUN  S0, D0, #32
+            };
+        }
+
+        private static uint[] _ShrImmSaturatingNarrow_V_8H8B_8H16B_() // Base vector encodings; the matching test ORs in Rd/Rn, the shift field and Q.
+        {
+            return new uint[]
+            {
+                0x0F089C00u, // SQRSHRN  V0.8B, V0.8H, #8
+                0x2F089C00u, // UQRSHRN  V0.8B, V0.8H, #8
+                0x2F088C00u, // SQRSHRUN V0.8B, V0.8H, #8
+                0x0F089400u, // SQSHRN   V0.8B, V0.8H, #8
+                0x2F089400u, // UQSHRN   V0.8B, V0.8H, #8
+                0x2F088400u  // SQSHRUN  V0.8B, V0.8H, #8
+            };
+        }
+
+        private static uint[] _ShrImmSaturatingNarrow_V_4S4H_4S8H_() // Base vector encodings; the matching test ORs in Rd/Rn, the shift field and Q.
+        {
+            return new uint[]
+            {
+                0x0F109C00u, // SQRSHRN  V0.4H, V0.4S, #16
+                0x2F109C00u, // UQRSHRN  V0.4H, V0.4S, #16
+                0x2F108C00u, // SQRSHRUN V0.4H, V0.4S, #16
+                0x0F109400u, // SQSHRN   V0.4H, V0.4S, #16
+                0x2F109400u, // UQSHRN   V0.4H, V0.4S, #16
+                0x2F108400u  // SQSHRUN  V0.4H, V0.4S, #16
+            };
+        }
+
+        private static uint[] _ShrImmSaturatingNarrow_V_2D2S_2D4S_() // Base vector encodings; the matching test ORs in Rd/Rn, the shift field and Q.
+        {
+            return new uint[]
+            {
+                0x0F209C00u, // SQRSHRN  V0.2S, V0.2D, #32
+                0x2F209C00u, // UQRSHRN  V0.2S, V0.2D, #32
+                0x2F208C00u, // SQRSHRUN V0.2S, V0.2D, #32
+                0x0F209400u, // SQSHRN   V0.2S, V0.2D, #32
+                0x2F209400u, // UQSHRN   V0.2S, V0.2D, #32
+                0x2F208400u  // SQSHRUN  V0.2S, V0.2D, #32
+            };
+        }
 #endregion
 
         private const int RndCnt = 2;
@@ -339,6 +456,207 @@ namespace Ryujinx.Tests.Cpu
 
             CompareAgainstUnicorn();
         }
+
+        [Test, Pairwise]
+        public void ShrImmNarrow_V_8H8B_8H16B([ValueSource("_ShrImmNarrow_V_8H8B_8H16B_")] uint Opcodes,
+                                              [Values(0u)]     uint Rd,
+                                              [Values(1u, 0u)] uint Rn,
+                                              [ValueSource("_4H_")] [Random(RndCnt)] ulong Z,
+                                              [ValueSource("_4H_")] [Random(RndCnt)] ulong A,
+                                              [Range(1u, 8u)] uint Shift,
+                                              [Values(0b0u, 0b1u)] uint Q) // <8H8B, 8H16B>
+        {
+            // Value placed at bits 16 and up: 16 - Shift, i.e. 15 down to 8 for Shift = 1..8.
+            uint ImmHB = (16 - Shift) & 0x7F;
+
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5);
+            Opcodes |= ImmHB << 16;
+            Opcodes |= (Q & 1) << 30;
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0E1(A, A);
+
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1);
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void ShrImmNarrow_V_4S4H_4S8H([ValueSource("_ShrImmNarrow_V_4S4H_4S8H_")] uint Opcodes,
+                                             [Values(0u)]     uint Rd,
+                                             [Values(1u, 0u)] uint Rn,
+                                             [ValueSource("_2S_")] [Random(RndCnt)] ulong Z,
+                                             [ValueSource("_2S_")] [Random(RndCnt)] ulong A,
+                                             [Range(1u, 16u)] uint Shift,
+                                             [Values(0b0u, 0b1u)] uint Q) // <4S4H, 4S8H>
+        {
+            // Value placed at bits 16 and up: 32 - Shift, i.e. 31 down to 16 for Shift = 1..16.
+            uint ImmHB = (32 - Shift) & 0x7F;
+
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5);
+            Opcodes |= ImmHB << 16;
+            Opcodes |= (Q & 1) << 30;
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0E1(A, A);
+
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1);
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void ShrImmNarrow_V_2D2S_2D4S([ValueSource("_ShrImmNarrow_V_2D2S_2D4S_")] uint Opcodes,
+                                             [Values(0u)]     uint Rd,
+                                             [Values(1u, 0u)] uint Rn,
+                                             [ValueSource("_1D_")] [Random(RndCnt)] ulong Z,
+                                             [ValueSource("_1D_")] [Random(RndCnt)] ulong A,
+                                             [Range(1u, 32u)] uint Shift,
+                                             [Values(0b0u, 0b1u)] uint Q) // <2D2S, 2D4S>
+        {
+            // Value placed at bits 16 and up: 64 - Shift, i.e. 63 down to 32 for Shift = 1..32.
+            uint ImmHB = (64 - Shift) & 0x7F;
+
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5);
+            Opcodes |= ImmHB << 16;
+            Opcodes |= (Q & 1) << 30;
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0E1(A, A);
+
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1);
+            CompareAgainstUnicorn();
+        }
+
+        [Test, Pairwise]
+        public void ShrImmSaturatingNarrow_S_HB([ValueSource("_ShrImmSaturatingNarrow_S_HB_")] uint Opcodes,
+                                                [Values(0u)]     uint Rd,
+                                                [Values(1u, 0u)] uint Rn,
+                                                [ValueSource("_1H_")] [Random(RndCnt)] ulong Z,
+                                                [ValueSource("_1H_")] [Random(RndCnt)] ulong A,
+                                                [Range(1u, 8u)] uint Shift)
+        {
+            uint ImmHB = (16 - Shift) & 0x7F; // 15 down to 8 for Shift = 1..8
+
+            // Register numbers go to bits 0-4 (Rd) and 5-9 (Rn); ImmHB starts at bit 16.
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5) | (ImmHB << 16);
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0(A);
+
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1);
+            // FPSR.QC is the only bit handed over as the FPSR mask.
+            CompareAgainstUnicorn(FpsrMask: FPSR.QC);
+        }
+
+        [Test, Pairwise]
+        public void ShrImmSaturatingNarrow_S_SH([ValueSource("_ShrImmSaturatingNarrow_S_SH_")] uint Opcodes,
+                                                [Values(0u)]     uint Rd,
+                                                [Values(1u, 0u)] uint Rn,
+                                                [ValueSource("_1S_")] [Random(RndCnt)] ulong Z,
+                                                [ValueSource("_1S_")] [Random(RndCnt)] ulong A,
+                                                [Range(1u, 16u)] uint Shift)
+        {
+            uint ImmHB = (32 - Shift) & 0x7F; // 31 down to 16 for Shift = 1..16
+
+            // Register numbers go to bits 0-4 (Rd) and 5-9 (Rn); ImmHB starts at bit 16.
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5) | (ImmHB << 16);
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0(A);
+
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1);
+            // FPSR.QC is the only bit handed over as the FPSR mask.
+            CompareAgainstUnicorn(FpsrMask: FPSR.QC);
+        }
+
+        [Test, Pairwise]
+        public void ShrImmSaturatingNarrow_S_DS([ValueSource("_ShrImmSaturatingNarrow_S_DS_")] uint Opcodes,
+                                                [Values(0u)]     uint Rd,
+                                                [Values(1u, 0u)] uint Rn,
+                                                [ValueSource("_1D_")] [Random(RndCnt)] ulong Z,
+                                                [ValueSource("_1D_")] [Random(RndCnt)] ulong A,
+                                                [Range(1u, 32u)] uint Shift)
+        {
+            uint ImmHB = (64 - Shift) & 0x7F; // 63 down to 32 for Shift = 1..32
+
+            // Register numbers go to bits 0-4 (Rd) and 5-9 (Rn); ImmHB starts at bit 16.
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5) | (ImmHB << 16);
+
+            Vector128<float> Reg0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> Reg1 = MakeVectorE0(A);
+
+            SingleOpcode(Opcodes, V0: Reg0, V1: Reg1);
+            // FPSR.QC is the only bit handed over as the FPSR mask.
+            CompareAgainstUnicorn(FpsrMask: FPSR.QC);
+        }
+
+        [Test, Pairwise]
+        public void ShrImmSaturatingNarrow_V_8H8B_8H16B([ValueSource("_ShrImmSaturatingNarrow_V_8H8B_8H16B_")] uint Opcodes,
+                                                        [Values(0u)]     uint Rd,
+                                                        [Values(1u, 0u)] uint Rn,
+                                                        [ValueSource("_4H_")] [Random(RndCnt)] ulong Z,
+                                                        [ValueSource("_4H_")] [Random(RndCnt)] ulong A,
+                                                        [Range(1u, 8u)] uint Shift,
+                                                        [Values(0b0u, 0b1u)] uint Q) // <8H8B, 8H16B>
+        {
+            uint ImmHB = (16 - Shift) & 0x7F; // 15 down to 8 for Shift = 1..8
+
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5);
+            Opcodes |= ImmHB << 16;
+            Opcodes |= (Q & 1) << 30;
+
+            // The source operand is a full 128-bit 8H vector, so A is supplied for both halves (as in ShrImmNarrow_V_8H8B_8H16B).
+            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> V1 = MakeVectorE0E1(A, A);
+
+            SingleOpcode(Opcodes, V0: V0, V1: V1);
+            CompareAgainstUnicorn(FpsrMask: FPSR.QC);
+        }
+
+        [Test, Pairwise]
+        public void ShrImmSaturatingNarrow_V_4S4H_4S8H([ValueSource("_ShrImmSaturatingNarrow_V_4S4H_4S8H_")] uint Opcodes,
+                                                       [Values(0u)]     uint Rd,
+                                                       [Values(1u, 0u)] uint Rn,
+                                                       [ValueSource("_2S_")] [Random(RndCnt)] ulong Z,
+                                                       [ValueSource("_2S_")] [Random(RndCnt)] ulong A,
+                                                       [Range(1u, 16u)] uint Shift,
+                                                       [Values(0b0u, 0b1u)] uint Q) // <4S4H, 4S8H>
+        {
+            uint ImmHB = (32 - Shift) & 0x7F; // 31 down to 16 for Shift = 1..16
+
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5);
+            Opcodes |= ImmHB << 16;
+            Opcodes |= (Q & 1) << 30;
+
+            // The source operand is a full 128-bit 4S vector, so A is supplied for both halves (as in ShrImmNarrow_V_4S4H_4S8H).
+            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> V1 = MakeVectorE0E1(A, A);
+
+            SingleOpcode(Opcodes, V0: V0, V1: V1);
+            CompareAgainstUnicorn(FpsrMask: FPSR.QC);
+        }
+
+        [Test, Pairwise]
+        public void ShrImmSaturatingNarrow_V_2D2S_2D4S([ValueSource("_ShrImmSaturatingNarrow_V_2D2S_2D4S_")] uint Opcodes,
+                                                       [Values(0u)]     uint Rd,
+                                                       [Values(1u, 0u)] uint Rn,
+                                                       [ValueSource("_1D_")] [Random(RndCnt)] ulong Z,
+                                                       [ValueSource("_1D_")] [Random(RndCnt)] ulong A,
+                                                       [Range(1u, 32u)] uint Shift,
+                                                       [Values(0b0u, 0b1u)] uint Q) // <2D2S, 2D4S>
+        {
+            uint ImmHB = (64 - Shift) & 0x7F; // 63 down to 32 for Shift = 1..32
+
+            Opcodes |= (Rd & 31) | ((Rn & 31) << 5);
+            Opcodes |= ImmHB << 16;
+            Opcodes |= (Q & 1) << 30;
+
+            // The source operand is a full 128-bit 2D vector, so A is supplied for both halves (as in ShrImmNarrow_V_2D2S_2D4S).
+            Vector128<float> V0 = MakeVectorE0E1(Z, Z);
+            Vector128<float> V1 = MakeVectorE0E1(A, A);
+
+            SingleOpcode(Opcodes, V0: V0, V1: V1);
+            CompareAgainstUnicorn(FpsrMask: FPSR.QC);
+        }
 #endif
     }
 }