diff --git a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
index be549875b..811730fc4 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
@@ -4,6 +4,7 @@ using ChocolArm64.Translation;
 using System;
 using System.Reflection;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
 using static ChocolArm64.Instruction.AInstEmitSimdHelper;
@@ -31,7 +32,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse2)
             {
-                EmitSse2Call(Context, nameof(Sse2.Add));
+                EmitSse2Op(Context, nameof(Sse2.Add));
             }
             else
             {
@@ -175,7 +176,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.AddScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.AddScalar));
             }
             else
             {
@@ -187,7 +188,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.Add));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.Add));
             }
             else
             {
@@ -218,7 +219,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.DivideScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.DivideScalar));
             }
             else
             {
@@ -230,7 +231,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.Divide));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.Divide));
             }
             else
             {
@@ -240,11 +241,49 @@ namespace ChocolArm64.Instruction
 
         public static void Fmadd_S(AILEmitterCtx Context)
         {
-            EmitScalarTernaryRaOpF(Context, () =>
+            if (AOptimizations.UseSse2)
             {
-                Context.Emit(OpCodes.Mul);
-                Context.Emit(OpCodes.Add);
-            });
+                AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
+
+                if (Op.Size == 0)
+                {
+                    Context.EmitLdvec(Op.Ra);
+                    Context.EmitLdvec(Op.Rn);
+                    Context.EmitLdvec(Op.Rm);
+
+                    Type[] Types = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    Context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), Types));
+                    Context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AddScalar),      Types));
+
+                    Context.EmitStvec(Op.Rd);
+
+                    EmitVectorZero32_128(Context, Op.Rd);
+                }
+                else /* if (Op.Size == 1) */
+                {
+                    EmitLdvecWithCastToDouble(Context, Op.Ra);
+                    EmitLdvecWithCastToDouble(Context, Op.Rn);
+                    EmitLdvecWithCastToDouble(Context, Op.Rm);
+
+                    Type[] Types = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), Types));
+                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AddScalar),      Types));
+
+                    EmitStvecWithCastFromDouble(Context, Op.Rd);
+
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
+            }
+            else
+            {
+                EmitScalarTernaryRaOpF(Context, () =>
+                {
+                    Context.Emit(OpCodes.Mul);
+                    Context.Emit(OpCodes.Add);
+                });
+            }
         }
 
         public static void Fmax_S(AILEmitterCtx Context)
@@ -379,7 +418,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.MultiplyScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.MultiplyScalar));
             }
             else
             {
@@ -396,7 +435,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.Multiply));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.Multiply));
             }
             else
             {
@@ -763,7 +802,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.SubtractScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.SubtractScalar));
             }
             else
             {
@@ -775,7 +814,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.Subtract));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.Subtract));
             }
             else
             {
@@ -1103,7 +1142,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse2)
             {
-                EmitSse2Call(Context, nameof(Sse2.Subtract));
+                EmitSse2Op(Context, nameof(Sse2.Subtract));
             }
             else
             {
diff --git a/ChocolArm64/Instruction/AInstEmitSimdCmp.cs b/ChocolArm64/Instruction/AInstEmitSimdCmp.cs
index 6357396d3..97f7623fa 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdCmp.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdCmp.cs
@@ -23,11 +23,11 @@ namespace ChocolArm64.Instruction
             {
                 if (Op.Size < 3 && AOptimizations.UseSse2)
                 {
-                    EmitSse2Call(Context, nameof(Sse2.CompareEqual));
+                    EmitSse2Op(Context, nameof(Sse2.CompareEqual));
                 }
                 else if (Op.Size == 3 && AOptimizations.UseSse41)
                 {
-                    EmitSse41Call(Context, nameof(Sse41.CompareEqual));
+                    EmitSse41Op(Context, nameof(Sse41.CompareEqual));
                 }
                 else
                 {
@@ -61,11 +61,11 @@ namespace ChocolArm64.Instruction
             {
                 if (Op.Size < 3 && AOptimizations.UseSse2)
                 {
-                    EmitSse2Call(Context, nameof(Sse2.CompareGreaterThan));
+                    EmitSse2Op(Context, nameof(Sse2.CompareGreaterThan));
                 }
                 else if (Op.Size == 3 && AOptimizations.UseSse42)
                 {
-                    EmitSse42Call(Context, nameof(Sse42.CompareGreaterThan));
+                    EmitSse42Op(Context, nameof(Sse42.CompareGreaterThan));
                 }
                 else
                 {
@@ -158,7 +158,7 @@ namespace ChocolArm64.Instruction
             if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
                                                  && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.CompareEqualScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.CompareEqualScalar));
             }
             else
             {
@@ -171,7 +171,7 @@ namespace ChocolArm64.Instruction
             if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
                                                  && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.CompareEqual));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.CompareEqual));
             }
             else
             {
@@ -184,7 +184,7 @@ namespace ChocolArm64.Instruction
             if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
                                                  && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanOrEqualScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThanOrEqualScalar));
             }
             else
             {
@@ -197,7 +197,7 @@ namespace ChocolArm64.Instruction
             if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
                                                  && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanOrEqual));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThanOrEqual));
             }
             else
             {
@@ -210,7 +210,7 @@ namespace ChocolArm64.Instruction
             if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
                                                  && AOptimizations.UseSse2)
             {
-                EmitScalarSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanScalar));
+                EmitScalarSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThanScalar));
             }
             else
             {
@@ -223,7 +223,7 @@ namespace ChocolArm64.Instruction
             if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
                                                  && AOptimizations.UseSse2)
             {
-                EmitVectorSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThan));
+                EmitVectorSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThan));
             }
             else
             {
diff --git a/ChocolArm64/Instruction/AInstEmitSimdCvt.cs b/ChocolArm64/Instruction/AInstEmitSimdCvt.cs
index 231de0aff..76d984a23 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdCvt.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdCvt.cs
@@ -3,6 +3,8 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 
 using static ChocolArm64.Instruction.AInstEmitSimdHelper;
 
@@ -14,11 +16,48 @@ namespace ChocolArm64.Instruction
         {
             AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
 
-            EmitVectorExtractF(Context, Op.Rn, 0, Op.Size);
+            if (AOptimizations.UseSse2)
+            {
+                if (Op.Size == 1 && Op.Opc == 0)
+                {
+                    //Double -> Single.
+                    AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleZero));
 
-            EmitFloatCast(Context, Op.Opc);
+                    EmitLdvecWithCastToDouble(Context, Op.Rn);
 
-            EmitScalarSetF(Context, Op.Rd, Op.Opc);
+                    Type[] Types = new Type[] { typeof(Vector128<float>), typeof(Vector128<double>) };
+
+                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Single), Types));
+
+                    Context.EmitStvec(Op.Rd);
+                }
+                else if (Op.Size == 0 && Op.Opc == 1)
+                {
+                    //Single -> Double.
+                    AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorDoubleZero));
+
+                    Context.EmitLdvec(Op.Rn);
+
+                    Type[] Types = new Type[] { typeof(Vector128<double>), typeof(Vector128<float>) };
+
+                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Double), Types));
+
+                    EmitStvecWithCastFromDouble(Context, Op.Rd);
+                }
+                else
+                {
+                    //Invalid encoding.
+                    throw new InvalidOperationException();
+                }
+            }
+            else
+            {
+                EmitVectorExtractF(Context, Op.Rn, 0, Op.Size);
+
+                EmitFloatCast(Context, Op.Opc);
+
+                EmitScalarSetF(Context, Op.Rd, Op.Opc);
+            }
         }
 
         public static void Fcvtas_Gp(AILEmitterCtx Context)
diff --git a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
index 171de43be..381fc46ac 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
@@ -4,7 +4,6 @@ using ChocolArm64.Translation;
 using System;
 using System.Reflection;
 using System.Reflection.Emit;
-using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
@@ -12,6 +11,38 @@ namespace ChocolArm64.Instruction
 {
     static class AInstEmitSimdHelper
     {
+        public static readonly Type[] IntTypesPerSizeLog2 = new Type[]
+        {
+            typeof(sbyte),
+            typeof(short),
+            typeof(int),
+            typeof(long)
+        };
+
+        public static readonly Type[] UIntTypesPerSizeLog2 = new Type[]
+        {
+            typeof(byte),
+            typeof(ushort),
+            typeof(uint),
+            typeof(ulong)
+        };
+
+        public static readonly Type[] VectorIntTypesPerSizeLog2 = new Type[]
+        {
+            typeof(Vector128<sbyte>),
+            typeof(Vector128<short>),
+            typeof(Vector128<int>),
+            typeof(Vector128<long>)
+        };
+
+        public static readonly Type[] VectorUIntTypesPerSizeLog2 = new Type[]
+        {
+            typeof(Vector128<byte>),
+            typeof(Vector128<ushort>),
+            typeof(Vector128<uint>),
+            typeof(Vector128<ulong>)
+        };
+
         [Flags]
         public enum OperFlags
         {
@@ -36,56 +67,32 @@ namespace ChocolArm64.Instruction
             return (8 << (Op.Size + 1)) - Op.Imm;
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void EmitSse2Call(AILEmitterCtx Context, string Name)
+        public static void EmitSse2Op(AILEmitterCtx Context, string Name)
         {
-            EmitSseCall(Context, Name, typeof(Sse2));
+            EmitSseOp(Context, Name, typeof(Sse2));
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void EmitSse41Call(AILEmitterCtx Context, string Name)
+        public static void EmitSse41Op(AILEmitterCtx Context, string Name)
         {
-            EmitSseCall(Context, Name, typeof(Sse41));
+            EmitSseOp(Context, Name, typeof(Sse41));
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void EmitSse42Call(AILEmitterCtx Context, string Name)
+        public static void EmitSse42Op(AILEmitterCtx Context, string Name)
         {
-            EmitSseCall(Context, Name, typeof(Sse42));
+            EmitSseOp(Context, Name, typeof(Sse42));
         }
 
-        private static void EmitSseCall(AILEmitterCtx Context, string Name, Type Type)
+        private static void EmitSseOp(AILEmitterCtx Context, string Name, Type Type)
         {
             AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
 
-            void Ldvec(int Reg)
-            {
-                Context.EmitLdvec(Reg);
+            EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size);
 
-                switch (Op.Size)
-                {
-                    case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToSByte)); break;
-                    case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt16)); break;
-                    case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt32)); break;
-                    case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt64)); break;
-                }
-            }
-
-            Ldvec(Op.Rn);
-
-            Type BaseType = null;
-
-            switch (Op.Size)
-            {
-                case 0: BaseType = typeof(Vector128<sbyte>); break;
-                case 1: BaseType = typeof(Vector128<short>); break;
-                case 2: BaseType = typeof(Vector128<int>);   break;
-                case 3: BaseType = typeof(Vector128<long>);  break;
-            }
+            Type BaseType = VectorIntTypesPerSizeLog2[Op.Size];
 
             if (Op is AOpCodeSimdReg BinOp)
             {
-                Ldvec(BinOp.Rm);
+                EmitLdvecWithSignedCast(Context, BinOp.Rm, Op.Size);
 
                 Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType, BaseType }));
             }
@@ -94,15 +101,7 @@ namespace ChocolArm64.Instruction
                 Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType }));
             }
 
-            switch (Op.Size)
-            {
-                case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSByteToSingle)); break;
-                case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16ToSingle)); break;
-                case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32ToSingle)); break;
-                case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt64ToSingle)); break;
-            }
-
-            Context.EmitStvec(Op.Rd);
+            EmitStvecWithSignedCast(Context, Op.Rd, Op.Size);
 
             if (Op.RegisterSize == ARegisterSize.SIMD64)
             {
@@ -110,17 +109,91 @@ namespace ChocolArm64.Instruction
             }
         }
 
-        public static void EmitScalarSseOrSse2CallF(AILEmitterCtx Context, string Name)
+        public static void EmitLdvecWithSignedCast(AILEmitterCtx Context, int Reg, int Size)
         {
-            EmitSseOrSse2CallF(Context, Name, true);
+            Context.EmitLdvec(Reg);
+
+            switch (Size)
+            {
+                case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToSByte)); break;
+                case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt16)); break;
+                case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt32)); break;
+                case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt64)); break;
+
+                default: throw new ArgumentOutOfRangeException(nameof(Size));
+            }
         }
 
-        public static void EmitVectorSseOrSse2CallF(AILEmitterCtx Context, string Name)
+        public static void EmitLdvecWithCastToDouble(AILEmitterCtx Context, int Reg)
         {
-            EmitSseOrSse2CallF(Context, Name, false);
+            Context.EmitLdvec(Reg);
+
+            AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToDouble));
         }
 
-        public static void EmitSseOrSse2CallF(AILEmitterCtx Context, string Name, bool Scalar)
+        public static void EmitStvecWithCastFromDouble(AILEmitterCtx Context, int Reg)
+        {
+            AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorDoubleToSingle));
+
+            Context.EmitStvec(Reg);
+        }
+
+        public static void EmitLdvecWithUnsignedCast(AILEmitterCtx Context, int Reg, int Size)
+        {
+            Context.EmitLdvec(Reg);
+
+            switch (Size)
+            {
+                case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToByte));   break;
+                case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToUInt16)); break;
+                case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToUInt32)); break;
+                case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToUInt64)); break;
+
+                default: throw new ArgumentOutOfRangeException(nameof(Size));
+            }
+        }
+
+        public static void EmitStvecWithSignedCast(AILEmitterCtx Context, int Reg, int Size)
+        {
+            switch (Size)
+            {
+                case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSByteToSingle)); break;
+                case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16ToSingle)); break;
+                case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32ToSingle)); break;
+                case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt64ToSingle)); break;
+
+                default: throw new ArgumentOutOfRangeException(nameof(Size));
+            }
+
+            Context.EmitStvec(Reg);
+        }
+
+        public static void EmitStvecWithUnsignedCast(AILEmitterCtx Context, int Reg, int Size)
+        {
+            switch (Size)
+            {
+                case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorByteToSingle));   break;
+                case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorUInt16ToSingle)); break;
+                case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorUInt32ToSingle)); break;
+                case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorUInt64ToSingle)); break;
+
+                default: throw new ArgumentOutOfRangeException(nameof(Size));
+            }
+
+            Context.EmitStvec(Reg);
+        }
+
+        public static void EmitScalarSseOrSse2OpF(AILEmitterCtx Context, string Name)
+        {
+            EmitSseOrSse2OpF(Context, Name, true);
+        }
+
+        public static void EmitVectorSseOrSse2OpF(AILEmitterCtx Context, string Name)
+        {
+            EmitSseOrSse2OpF(Context, Name, false);
+        }
+
+        public static void EmitSseOrSse2OpF(AILEmitterCtx Context, string Name, bool Scalar)
         {
             AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
 
@@ -1183,8 +1256,21 @@ namespace ChocolArm64.Instruction
 
         public static void EmitScalarSetF(AILEmitterCtx Context, int Reg, int Size)
         {
-            EmitVectorZeroAll(Context, Reg);
-            EmitVectorInsertF(Context, Reg, 0, Size);
+            if (AOptimizations.UseSse41 && Size == 0)
+            {
+                //If the type is float, we can perform insertion and
+                //zero the upper bits with a single instruction (INSERTPS).
+                Context.EmitLdvec(Reg);
+
+                AVectorHelper.EmitCall(Context, nameof(AVectorHelper.Sse41VectorInsertScalarSingle));
+
+                Context.EmitStvec(Reg);
+            }
+            else
+            {
+                EmitVectorZeroAll(Context, Reg);
+                EmitVectorInsertF(Context, Reg, 0, Size);
+            }
         }
 
         public static void EmitVectorExtractSx(AILEmitterCtx Context, int Reg, int Index, int Size)
@@ -1235,8 +1321,17 @@ namespace ChocolArm64.Instruction
 
         public static void EmitVectorZeroAll(AILEmitterCtx Context, int Rd)
         {
-            EmitVectorZeroLower(Context, Rd);
-            EmitVectorZeroUpper(Context, Rd);
+            if (AOptimizations.UseSse2)
+            {
+                AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleZero));
+
+                Context.EmitStvec(Rd);
+            }
+            else
+            {
+                EmitVectorZeroLower(Context, Rd);
+                EmitVectorZeroUpper(Context, Rd);
+            }
         }
 
         public static void EmitVectorZeroLower(AILEmitterCtx Context, int Rd)
@@ -1249,9 +1344,32 @@ namespace ChocolArm64.Instruction
             EmitVectorInsertTmp(Context, 0, 3, 0);
         }
 
-        public static void EmitVectorZeroUpper(AILEmitterCtx Context, int Rd)
+        public static void EmitVectorZeroUpper(AILEmitterCtx Context, int Reg)
         {
-            EmitVectorInsert(Context, Rd, 1, 3, 0);
+            if (AOptimizations.UseSse2)
+            {
+                //TODO: Use MoveScalar once it is fixed; as of the
+                //time of writing, it just crashes the JIT.
+                EmitLdvecWithUnsignedCast(Context, Reg, 3);
+
+                Type[] Types = new Type[] { typeof(Vector128<ulong>), typeof(byte) };
+
+                //Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), Types));
+
+                Context.EmitLdc_I4(8);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), Types));
+
+                Context.EmitLdc_I4(8);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), Types));
+
+                EmitStvecWithUnsignedCast(Context, Reg, 3);
+            }
+            else
+            {
+                EmitVectorInsert(Context, Reg, 1, 3, 0);
+            }
         }
 
         public static void EmitVectorZero32_128(AILEmitterCtx Context, int Reg)
diff --git a/ChocolArm64/Instruction/AInstEmitSimdLogical.cs b/ChocolArm64/Instruction/AInstEmitSimdLogical.cs
index 9f5af96cb..1aa8981f5 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdLogical.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdLogical.cs
@@ -15,7 +15,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse2)
             {
-                EmitSse2Call(Context, nameof(Sse2.And));
+                EmitSse2Op(Context, nameof(Sse2.And));
             }
             else
             {
@@ -25,11 +25,36 @@ namespace ChocolArm64.Instruction
 
         public static void Bic_V(AILEmitterCtx Context)
         {
-            EmitVectorBinaryOpZx(Context, () =>
+            if (AOptimizations.UseSse2)
             {
-                Context.Emit(OpCodes.Not);
-                Context.Emit(OpCodes.And);
-            });
+                AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
+
+                EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
+                EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
+
+                Type[] Types = new Type[]
+                {
+                    VectorUIntTypesPerSizeLog2[Op.Size],
+                    VectorUIntTypesPerSizeLog2[Op.Size]
+                };
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), Types));
+
+                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorBinaryOpZx(Context, () =>
+                {
+                    Context.Emit(OpCodes.Not);
+                    Context.Emit(OpCodes.And);
+                });
+            }
         }
 
         public static void Bic_Vi(AILEmitterCtx Context)
@@ -55,59 +80,124 @@ namespace ChocolArm64.Instruction
         {
             AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
 
-            int Bytes = Op.GetBitsCount() >> 3;
-            int Elems = Bytes >> Op.Size;
-
-            for (int Index = 0; Index < Elems; Index++)
+            if (AOptimizations.UseSse2)
             {
-                EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size);
-                EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size);
-
-                Context.Emit(OpCodes.Xor);
-
-                EmitVectorExtractZx(Context, Op.Rm, Index, Op.Size);
-
-                if (NotRm)
+                Type[] Types = new Type[]
                 {
-                    Context.Emit(OpCodes.Not);
+                    VectorUIntTypesPerSizeLog2[Op.Size],
+                    VectorUIntTypesPerSizeLog2[Op.Size]
+                };
+
+                EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
+                EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+                EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types));
+
+                string Name = NotRm ? nameof(Sse2.AndNot) : nameof(Sse2.And);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(Name, Types));
+
+                EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types));
+
+                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
+            }
+            else
+            {
+                int Bytes = Op.GetBitsCount() >> 3;
+                int Elems = Bytes >> Op.Size;
+
+                for (int Index = 0; Index < Elems; Index++)
+                {
+                    EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size);
+                    EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size);
+
+                    Context.Emit(OpCodes.Xor);
+
+                    EmitVectorExtractZx(Context, Op.Rm, Index, Op.Size);
+
+                    if (NotRm)
+                    {
+                        Context.Emit(OpCodes.Not);
+                    }
+
+                    Context.Emit(OpCodes.And);
+
+                    EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size);
+
+                    Context.Emit(OpCodes.Xor);
+
+                    EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
                 }
 
-                Context.Emit(OpCodes.And);
-
-                EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size);
-
-                Context.Emit(OpCodes.Xor);
-
-                EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
-            }
-
-            if (Op.RegisterSize == ARegisterSize.SIMD64)
-            {
-                EmitVectorZeroUpper(Context, Op.Rd);
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
             }
         }
 
         public static void Bsl_V(AILEmitterCtx Context)
         {
-            EmitVectorTernaryOpZx(Context, () =>
+            if (AOptimizations.UseSse2)
             {
-                Context.EmitSttmp();
-                Context.EmitLdtmp();
+                AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
 
-                Context.Emit(OpCodes.Xor);
-                Context.Emit(OpCodes.And);
+                Type[] Types = new Type[]
+                {
+                    VectorUIntTypesPerSizeLog2[Op.Size],
+                    VectorUIntTypesPerSizeLog2[Op.Size]
+                };
 
-                Context.EmitLdtmp();
+                EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
+                EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
 
-                Context.Emit(OpCodes.Xor);
-            });
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types));
+
+                EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), Types));
+
+                EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types));
+
+                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorTernaryOpZx(Context, () =>
+                {
+                    Context.EmitSttmp();
+                    Context.EmitLdtmp();
+
+                    Context.Emit(OpCodes.Xor);
+                    Context.Emit(OpCodes.And);
+
+                    Context.EmitLdtmp();
+
+                    Context.Emit(OpCodes.Xor);
+                });
+            }
         }
 
         public static void Eor_V(AILEmitterCtx Context)
         {
             if (AOptimizations.UseSse2)
             {
-                EmitSse2Call(Context, nameof(Sse2.Xor));
+                EmitSse2Op(Context, nameof(Sse2.Xor));
             }
             else
             {
@@ -133,7 +223,7 @@ namespace ChocolArm64.Instruction
         {
             if (AOptimizations.UseSse2)
             {
-                EmitSse2Call(Context, nameof(Sse2.Or));
+                EmitSse2Op(Context, nameof(Sse2.Or));
             }
             else
             {
diff --git a/ChocolArm64/Instruction/AInstEmitSimdMove.cs b/ChocolArm64/Instruction/AInstEmitSimdMove.cs
index 3bf1e4635..94097f480 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdMove.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdMove.cs
@@ -3,6 +3,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
 
 using static ChocolArm64.Instruction.AInstEmitSimdHelper;
 
@@ -14,19 +15,44 @@ namespace ChocolArm64.Instruction
         {
             AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
 
-            int Bytes = Op.GetBitsCount() >> 3;
-            int Elems = Bytes >> Op.Size;
-
-            for (int Index = 0; Index < Elems; Index++)
+            if (AOptimizations.UseSse2)
             {
                 Context.EmitLdintzr(Op.Rn);
 
-                EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
-            }
+                switch (Op.Size)
+                {
+                    case 0: Context.Emit(OpCodes.Conv_U1); break;
+                    case 1: Context.Emit(OpCodes.Conv_U2); break;
+                    case 2: Context.Emit(OpCodes.Conv_U4); break;
+                }
 
-            if (Op.RegisterSize == ARegisterSize.SIMD64)
+                Type[] Types = new Type[] { UIntTypesPerSizeLog2[Op.Size] };
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), Types));
+
+                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
+            }
+            else
             {
-                EmitVectorZeroUpper(Context, Op.Rd);
+                int Bytes = Op.GetBitsCount() >> 3;
+                int Elems = Bytes >> Op.Size;
+
+                for (int Index = 0; Index < Elems; Index++)
+                {
+                    Context.EmitLdintzr(Op.Rn);
+
+                    EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
+                }
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
             }
         }
 
@@ -295,25 +321,91 @@ namespace ChocolArm64.Instruction
 
             int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0;
 
-            if (Part != 0)
+            if (AOptimizations.UseSse41 && Op.Size < 2)
             {
-                Context.EmitLdvec(Op.Rd);
-                Context.EmitStvectmp();
+                void EmitZeroVector()
+                {
+                    switch (Op.Size)
+                    {
+                        case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16Zero)); break;
+                        case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32Zero)); break;
+                    }
+                }
+
+                //For XTN, first operand is source, second operand is 0.
+                //For XTN2, first operand is 0, second operand is source.
+                if (Part != 0)
+                {
+                    EmitZeroVector();
+                }
+
+                EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size + 1);
+
+                //Set mask to discard the upper half of the wide elements.
+                switch (Op.Size)
+                {
+                    case 0: Context.EmitLdc_I4(0x00ff);     break;
+                    case 1: Context.EmitLdc_I4(0x0000ffff); break;
+                }
+
+                Type WideType = IntTypesPerSizeLog2[Op.Size + 1];
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { WideType }));
+
+                WideType = VectorIntTypesPerSizeLog2[Op.Size + 1];
+
+                Type[] WideTypes = new Type[] { WideType, WideType };
+
+                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), WideTypes));
+
+                if (Part == 0)
+                {
+                    EmitZeroVector();
+                }
+
+                //Pack values with unsigned saturation, the saturation shouldn't
+                //change anything since the upper bits were masked off.
+                Type SseType = Op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+                Context.EmitCall(SseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), WideTypes));
+
+                if (Part != 0)
+                {
+                    //For XTN2, we additionally need to discard the upper bits
+                    //of the target register and OR the result with it.
+                    EmitVectorZeroUpper(Context, Op.Rd);
+
+                    EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                    Type NarrowType = VectorUIntTypesPerSizeLog2[Op.Size];
+
+                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { NarrowType, NarrowType }));
+                }
+
+                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
             }
-
-            for (int Index = 0; Index < Elems; Index++)
+            else
             {
-                EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1);
+                if (Part != 0)
+                {
+                    Context.EmitLdvec(Op.Rd);
+                    Context.EmitStvectmp();
+                }
 
-                EmitVectorInsertTmp(Context, Part + Index, Op.Size);
-            }
+                for (int Index = 0; Index < Elems; Index++)
+                {
+                    EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1);
 
-            Context.EmitLdvectmp();
-            Context.EmitStvec(Op.Rd);
+                    EmitVectorInsertTmp(Context, Part + Index, Op.Size);
+                }
 
-            if (Part == 0)
-            {
-                EmitVectorZeroUpper(Context, Op.Rd);
+                Context.EmitLdvectmp();
+                Context.EmitStvec(Op.Rd);
+
+                if (Part == 0)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
             }
         }
 
@@ -394,28 +486,64 @@ namespace ChocolArm64.Instruction
         {
             AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
 
-            int Words = Op.GetBitsCount() >> 4;
-            int Pairs = Words >> Op.Size;
-
-            int Base = Part != 0 ? Pairs : 0;
-
-            for (int Index = 0; Index < Pairs; Index++)
+            if (AOptimizations.UseSse2)
             {
-                int Idx = Index << 1;
+                EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
+                EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
 
-                EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size);
-                EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size);
+                Type[] Types = new Type[]
+                {
+                    VectorUIntTypesPerSizeLog2[Op.Size],
+                    VectorUIntTypesPerSizeLog2[Op.Size]
+                };
 
-                EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
-                EmitVectorInsertTmp(Context, Idx,     Op.Size);
+                string Name = Part == 0 || (Part != 0 && Op.RegisterSize == ARegisterSize.SIMD64)
+                    ? nameof(Sse2.UnpackLow)
+                    : nameof(Sse2.UnpackHigh);
+
+                Context.EmitCall(typeof(Sse2).GetMethod(Name, Types));
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64 && Part != 0)
+                {
+                    Context.EmitLdc_I4(8);
+
+                    Type[] ShTypes = new Type[] { VectorUIntTypesPerSizeLog2[Op.Size], typeof(byte) };
+
+                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), ShTypes));
+                }
+
+                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64 && Part == 0)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
             }
-
-            Context.EmitLdvectmp();
-            Context.EmitStvec(Op.Rd);
-
-            if (Op.RegisterSize == ARegisterSize.SIMD64)
+            else
             {
-                EmitVectorZeroUpper(Context, Op.Rd);
+                int Words = Op.GetBitsCount() >> 4;
+                int Pairs = Words >> Op.Size;
+
+                int Base = Part != 0 ? Pairs : 0;
+
+                for (int Index = 0; Index < Pairs; Index++)
+                {
+                    int Idx = Index << 1;
+
+                    EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size);
+                    EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size);
+
+                    EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
+                    EmitVectorInsertTmp(Context, Idx,     Op.Size);
+                }
+
+                Context.EmitLdvectmp();
+                Context.EmitStvec(Op.Rd);
+
+                if (Op.RegisterSize == ARegisterSize.SIMD64)
+                {
+                    EmitVectorZeroUpper(Context, Op.Rd);
+                }
             }
         }
     }
diff --git a/ChocolArm64/Instruction/AVectorHelper.cs b/ChocolArm64/Instruction/AVectorHelper.cs
index 3e4452abb..7f9d98cd8 100644
--- a/ChocolArm64/Instruction/AVectorHelper.cs
+++ b/ChocolArm64/Instruction/AVectorHelper.cs
@@ -227,7 +227,16 @@ namespace ChocolArm64.Instruction
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static double VectorExtractDouble(Vector128<float> Vector, byte Index)
         {
-            return BitConverter.Int64BitsToDouble(VectorExtractIntSx(Vector, Index, 3));
+            if (Sse41.IsSupported)
+            {
+                return BitConverter.Int64BitsToDouble(Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index));
+            }
+            else if (Sse2.IsSupported)
+            {
+                return BitConverter.Int64BitsToDouble((long)VectorExtractIntZx(Vector, Index, 3));
+            }
+
+            throw new PlatformNotSupportedException();
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -235,41 +244,49 @@ namespace ChocolArm64.Instruction
         {
             if (Sse41.IsSupported)
             {
-                switch (Size)
+                if (Size == 0)
                 {
-                    case 0:
-                        return (sbyte)Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
-
-                    case 1:
-                        return (short)Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
-
-                    case 2:
-                        return Sse41.Extract(Sse.StaticCast<float, int>(Vector), Index);
-
-                    case 3:
-                        return Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index);
+                    return (sbyte)Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
+                }
+                else if (Size == 1)
+                {
+                    return (short)Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
+                }
+                else if (Size == 2)
+                {
+                    return Sse41.Extract(Sse.StaticCast<float, int>(Vector), Index);
+                }
+                else if (Size == 3)
+                {
+                    return Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index);
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Size));
                 }
-
-                throw new ArgumentOutOfRangeException(nameof(Size));
             }
             else if (Sse2.IsSupported)
             {
-                switch (Size)
+                if (Size == 0)
                 {
-                    case 0:
-                        return (sbyte)VectorExtractIntZx(Vector, Index, Size);
-
-                    case 1:
-                        return (short)VectorExtractIntZx(Vector, Index, Size);
-
-                    case 2:
-                        return (int)VectorExtractIntZx(Vector, Index, Size);
-
-                    case 3:
-                        return (long)VectorExtractIntZx(Vector, Index, Size);
+                    return (sbyte)VectorExtractIntZx(Vector, Index, Size);
+                }
+                else if (Size == 1)
+                {
+                    return (short)VectorExtractIntZx(Vector, Index, Size);
+                }
+                else if (Size == 2)
+                {
+                    return (int)VectorExtractIntZx(Vector, Index, Size);
+                }
+                else if (Size == 3)
+                {
+                    return (long)VectorExtractIntZx(Vector, Index, Size);
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Size));
                 }
-
-                throw new ArgumentOutOfRangeException(nameof(Size));
             }
 
             throw new PlatformNotSupportedException();
@@ -280,22 +297,26 @@ namespace ChocolArm64.Instruction
         {
             if (Sse41.IsSupported)
             {
-                switch (Size)
+                if (Size == 0)
                 {
-                    case 0:
-                        return Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
-
-                    case 1:
-                        return Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
-
-                    case 2:
-                        return Sse41.Extract(Sse.StaticCast<float, uint>(Vector), Index);
-
-                    case 3:
-                        return Sse41.Extract(Sse.StaticCast<float, ulong>(Vector), Index);
+                    return Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
+                }
+                else if (Size == 1)
+                {
+                    return Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
+                }
+                else if (Size == 2)
+                {
+                    return Sse41.Extract(Sse.StaticCast<float, uint>(Vector), Index);
+                }
+                else if (Size == 3)
+                {
+                    return Sse41.Extract(Sse.StaticCast<float, ulong>(Vector), Index);
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Size));
                 }
-
-                throw new ArgumentOutOfRangeException(nameof(Size));
             }
             else if (Sse2.IsSupported)
             {
@@ -305,35 +326,35 @@ namespace ChocolArm64.Instruction
 
                 ushort Value = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)ShortIdx);
 
-                switch (Size)
+                if (Size == 0)
                 {
-                    case 0:
-                        return (byte)(Value >> (Index & 1) * 8);
-
-                    case 1:
-                        return Value;
-
-                    case 2:
-                    case 3:
-                    {
-                        ushort Value1 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 1));
-
-                        if (Size == 2)
-                        {
-                            return (uint)(Value | (Value1 << 16));
-                        }
-
-                        ushort Value2 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 2));
-                        ushort Value3 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 3));
-
-                        return ((ulong)Value  <<  0) |
-                               ((ulong)Value1 << 16) |
-                               ((ulong)Value2 << 32) |
-                               ((ulong)Value3 << 48);
-                    }
+                    return (byte)(Value >> (Index & 1) * 8);
                 }
+                else if (Size == 1)
+                {
+                    return Value;
+                }
+                else if (Size == 2 || Size == 3)
+                {
+                    ushort Value1 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 1));
 
-                throw new ArgumentOutOfRangeException(nameof(Size));
+                    if (Size == 2)
+                    {
+                        return (uint)(Value | (Value1 << 16));
+                    }
+
+                    ushort Value2 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 2));
+                    ushort Value3 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 3));
+
+                    return ((ulong)Value  <<  0) |
+                           ((ulong)Value1 << 16) |
+                           ((ulong)Value2 << 32) |
+                           ((ulong)Value3 << 48);
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Size));
+                }
             }
 
             throw new PlatformNotSupportedException();
@@ -370,22 +391,26 @@ namespace ChocolArm64.Instruction
         {
             if (Sse41.IsSupported)
             {
-                switch (Size)
+                if (Size == 0)
                 {
-                    case 0:
-                        return Sse.StaticCast<byte, float>(Sse41.Insert(Sse.StaticCast<float, byte>(Vector), (byte)Value, Index));
-
-                    case 1:
-                        return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
-
-                    case 2:
-                        return Sse.StaticCast<uint, float>(Sse41.Insert(Sse.StaticCast<float, uint>(Vector), (uint)Value, Index));
-
-                    case 3:
-                        return Sse.StaticCast<ulong, float>(Sse41.Insert(Sse.StaticCast<float, ulong>(Vector), Value, Index));
+                    return Sse.StaticCast<byte, float>(Sse41.Insert(Sse.StaticCast<float, byte>(Vector), (byte)Value, Index));
+                }
+                else if (Size == 1)
+                {
+                    return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
+                }
+                else if (Size == 2)
+                {
+                    return Sse.StaticCast<uint, float>(Sse41.Insert(Sse.StaticCast<float, uint>(Vector), (uint)Value, Index));
+                }
+                else if (Size == 3)
+                {
+                    return Sse.StaticCast<ulong, float>(Sse41.Insert(Sse.StaticCast<float, ulong>(Vector), Value, Index));
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Size));
                 }
-
-                throw new ArgumentOutOfRangeException(nameof(Size));
             }
             else if (Sse2.IsSupported)
             {
@@ -395,41 +420,39 @@ namespace ChocolArm64.Instruction
                     ? Index >> 1
                     : Index << (Size - 1);
 
-                switch (Size)
+                if (Size == 0)
                 {
-                    case 0:
-                    {
-                        ushort ShortVal = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)ShortIdx);
+                    ushort ShortVal = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)ShortIdx);
 
-                        int Shift = (Index & 1) * 8;
+                    int Shift = (Index & 1) * 8;
 
-                        ShortVal &= (ushort)(0xff00 >> Shift);
+                    ShortVal &= (ushort)(0xff00 >> Shift);
 
-                        ShortVal |= (ushort)((byte)Value << Shift);
+                    ShortVal |= (ushort)((byte)Value << Shift);
 
-                        return Sse.StaticCast<ushort, float>(Sse2.Insert(ShortVector, ShortVal, (byte)ShortIdx));
-                    }
-
-                    case 1:
-                        return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
-
-                    case 2:
-                    case 3:
-                    {
-                        ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >>  0), (byte)(ShortIdx + 0));
-                        ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 16), (byte)(ShortIdx + 1));
-
-                        if (Size == 3)
-                        {
-                            ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 32), (byte)(ShortIdx + 2));
-                            ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 48), (byte)(ShortIdx + 3));
-                        }
-
-                        return Sse.StaticCast<ushort, float>(ShortVector);
-                    }
+                    return Sse.StaticCast<ushort, float>(Sse2.Insert(ShortVector, ShortVal, (byte)ShortIdx));
                 }
+                else if (Size == 1)
+                {
+                    return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
+                }
+                else if (Size == 2 || Size == 3)
+                {
+                    ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >>  0), (byte)(ShortIdx + 0));
+                    ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 16), (byte)(ShortIdx + 1));
 
-                throw new ArgumentOutOfRangeException(nameof(Size));
+                    if (Size == 3)
+                    {
+                        ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 32), (byte)(ShortIdx + 2));
+                        ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 48), (byte)(ShortIdx + 3));
+                    }
+
+                    return Sse.StaticCast<ushort, float>(ShortVector);
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Size));
+                }
             }
 
             throw new PlatformNotSupportedException();
@@ -440,7 +463,29 @@ namespace ChocolArm64.Instruction
         {
             if (Sse41.IsSupported)
             {
-                return Sse41.Insert(Vector, Value, (byte)(Index << 4));
+                //Note: The if/else if is necessary to enable the JIT to
+                //produce a single INSERTPS instruction instead of the
+                //jump table fallback.
+                if (Index == 0)
+                {
+                    return Sse41.Insert(Vector, Value, 0x00);
+                }
+                else if (Index == 1)
+                {
+                    return Sse41.Insert(Vector, Value, 0x10);
+                }
+                else if (Index == 2)
+                {
+                    return Sse41.Insert(Vector, Value, 0x20);
+                }
+                else if (Index == 3)
+                {
+                    return Sse41.Insert(Vector, Value, 0x30);
+                }
+                else
+                {
+                    throw new ArgumentOutOfRangeException(nameof(Index));
+                }
             }
             else if (Sse2.IsSupported)
             {
@@ -460,6 +505,79 @@ namespace ChocolArm64.Instruction
             throw new PlatformNotSupportedException();
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<float> Sse41VectorInsertScalarSingle(float Value, Vector128<float> Vector)
+        {
+            //Note: 0b1110 is the INSERTPS zero mask, it clears the upper three elements (bits 127:32).
+            return Sse41.Insert(Vector, Value, 0b1110);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<sbyte> VectorSByteZero()
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.SetZeroVector128<sbyte>();
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<short> VectorInt16Zero()
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.SetZeroVector128<short>();
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<int> VectorInt32Zero()
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.SetZeroVector128<int>();
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<long> VectorInt64Zero()
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.SetZeroVector128<long>();
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<float> VectorSingleZero()
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.SetZeroVector128();
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<double> VectorDoubleZero()
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.SetZeroVector128<double>();
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector128<float> VectorZero32_128(Vector128<float> Vector)
         {
@@ -515,6 +633,50 @@ namespace ChocolArm64.Instruction
             throw new PlatformNotSupportedException();
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<byte> VectorSingleToByte(Vector128<float> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<float, byte>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ushort> VectorSingleToUInt16(Vector128<float> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<float, ushort>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<uint> VectorSingleToUInt32(Vector128<float> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<float, uint>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> VectorSingleToUInt64(Vector128<float> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<float, ulong>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector128<double> VectorSingleToDouble(Vector128<float> Vector)
         {
@@ -570,6 +732,50 @@ namespace ChocolArm64.Instruction
             throw new PlatformNotSupportedException();
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<float> VectorByteToSingle(Vector128<byte> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<byte, float>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<float> VectorUInt16ToSingle(Vector128<ushort> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<ushort, float>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<float> VectorUInt32ToSingle(Vector128<uint> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<uint, float>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<float> VectorUInt64ToSingle(Vector128<ulong> Vector)
+        {
+            if (Sse.IsSupported)
+            {
+                return Sse.StaticCast<ulong, float>(Vector);
+            }
+
+            throw new PlatformNotSupportedException();
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static Vector128<float> VectorDoubleToSingle(Vector128<double> Vector)
         {
diff --git a/ChocolArm64/Memory/AMemory.cs b/ChocolArm64/Memory/AMemory.cs
index 2cb9b16c2..bb6a2b549 100644
--- a/ChocolArm64/Memory/AMemory.cs
+++ b/ChocolArm64/Memory/AMemory.cs
@@ -232,7 +232,7 @@ namespace ChocolArm64.Memory
             }
         }
 
-        [MethodImpl(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public Vector128<float> ReadVector32(long Position)
         {
             if (Sse.IsSupported)
@@ -245,7 +245,7 @@ namespace ChocolArm64.Memory
             }
         }
 
-        [MethodImpl(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public Vector128<float> ReadVector64(long Position)
         {
             if (Sse2.IsSupported)
@@ -365,7 +365,7 @@ namespace ChocolArm64.Memory
             }
         }
 
-        [MethodImpl(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public void WriteVector32(long Position, Vector128<float> Value)
         {
             if (Sse.IsSupported)
@@ -378,7 +378,7 @@ namespace ChocolArm64.Memory
             }
         }
 
-        [MethodImpl(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public void WriteVector64(long Position, Vector128<float> Value)
         {
             if (Sse2.IsSupported)