using ChocolArm64.Decoder;
using ChocolArm64.State;
using ChocolArm64.Translation;
using System;
using System.Reflection.Emit;
using System.Runtime.Intrinsics.X86;

using static ChocolArm64.Instruction.AInstEmitSimdHelper;

namespace ChocolArm64.Instruction
{
    static partial class AInstEmit
    {
        public static void Dup_Gp(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            if (AOptimizations.UseSse2)
            {
                Context.EmitLdintzr(Op.Rn);

                switch (Op.Size)
                {
                    case 0: Context.Emit(OpCodes.Conv_U1); break;
                    case 1: Context.Emit(OpCodes.Conv_U2); break;
                    case 2: Context.Emit(OpCodes.Conv_U4); break;
                }

                Type[] Types = new Type[] { UIntTypesPerSizeLog2[Op.Size] };

                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), Types));

                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);

                if (Op.RegisterSize == ARegisterSize.SIMD64)
                {
                    EmitVectorZeroUpper(Context, Op.Rd);
                }
            }
            else
            {
                int Bytes = Op.GetBitsCount() >> 3;
                int Elems = Bytes >> Op.Size;

                for (int Index = 0; Index < Elems; Index++)
                {
                    Context.EmitLdintzr(Op.Rn);

                    EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
                }

                if (Op.RegisterSize == ARegisterSize.SIMD64)
                {
                    EmitVectorZeroUpper(Context, Op.Rd);
                }
            }
        }

        public static void Dup_S(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            EmitVectorExtractZx(Context, Op.Rn, Op.DstIndex, Op.Size);

            EmitScalarSet(Context, Op.Rd, Op.Size);
        }

        public static void Dup_V(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            int Bytes = Op.GetBitsCount() >> 3;
            int Elems = Bytes >> Op.Size;

            for (int Index = 0; Index < Elems; Index++)
            {
                EmitVectorExtractZx(Context, Op.Rn, Op.DstIndex, Op.Size);

                EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
            }

            if (Op.RegisterSize == ARegisterSize.SIMD64)
            {
                EmitVectorZeroUpper(Context, Op.Rd);
            }
        }

        public static void Ext_V(AILEmitterCtx Context)
        {
            AOpCodeSimdExt Op = (AOpCodeSimdExt)Context.CurrOp;

            Context.EmitLdvec(Op.Rd);
            Context.EmitStvectmp();

            int Bytes = Op.GetBitsCount() >> 3;

            int Position = Op.Imm4;

            for (int Index = 0; Index < Bytes; Index++)
            {
                int Reg = Op.Imm4 + Index < Bytes ? Op.Rn : Op.Rm;

                if (Position == Bytes)
                {
                    Position = 0;
                }

                EmitVectorExtractZx(Context, Reg, Position++, 0);
                EmitVectorInsertTmp(Context, Index, 0);
            }

            Context.EmitLdvectmp();
            Context.EmitStvec(Op.Rd);

            if (Op.RegisterSize == ARegisterSize.SIMD64)
            {
                EmitVectorZeroUpper(Context, Op.Rd);
            }
        }

        public static void Fcsel_S(AILEmitterCtx Context)
        {
            AOpCodeSimdFcond Op = (AOpCodeSimdFcond)Context.CurrOp;

            AILLabel LblTrue = new AILLabel();
            AILLabel LblEnd  = new AILLabel();

            Context.EmitCondBranch(LblTrue, Op.Cond);

            EmitVectorExtractF(Context, Op.Rm, 0, Op.Size);

            Context.Emit(OpCodes.Br_S, LblEnd);

            Context.MarkLabel(LblTrue);

            EmitVectorExtractF(Context, Op.Rn, 0, Op.Size);

            Context.MarkLabel(LblEnd);

            EmitScalarSetF(Context, Op.Rd, Op.Size);
        }

        public static void Fmov_Ftoi(AILEmitterCtx Context)
        {
            AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;

            EmitVectorExtractZx(Context, Op.Rn, 0, 3);

            EmitIntZeroUpperIfNeeded(Context);

            Context.EmitStintzr(Op.Rd);
        }

        public static void Fmov_Ftoi1(AILEmitterCtx Context)
        {
            AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;

            EmitVectorExtractZx(Context, Op.Rn, 1, 3);

            EmitIntZeroUpperIfNeeded(Context);

            Context.EmitStintzr(Op.Rd);
        }

        public static void Fmov_Itof(AILEmitterCtx Context)
        {
            AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;

            Context.EmitLdintzr(Op.Rn);

            EmitIntZeroUpperIfNeeded(Context);

            EmitScalarSet(Context, Op.Rd, 3);
        }

        public static void Fmov_Itof1(AILEmitterCtx Context)
        {
            AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;

            Context.EmitLdintzr(Op.Rn);

            EmitIntZeroUpperIfNeeded(Context);

            EmitVectorInsert(Context, Op.Rd, 1, 3);
        }

        public static void Fmov_S(AILEmitterCtx Context)
        {
            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;

            EmitVectorExtractF(Context, Op.Rn, 0, Op.Size);

            EmitScalarSetF(Context, Op.Rd, Op.Size);
        }

        public static void Fmov_Si(AILEmitterCtx Context)
        {
            AOpCodeSimdFmov Op = (AOpCodeSimdFmov)Context.CurrOp;

            Context.EmitLdc_I8(Op.Imm);

            EmitScalarSet(Context, Op.Rd, Op.Size + 2);
        }

        public static void Fmov_V(AILEmitterCtx Context)
        {
            AOpCodeSimdImm Op = (AOpCodeSimdImm)Context.CurrOp;

            int Elems = Op.RegisterSize == ARegisterSize.SIMD128 ? 4 : 2;

            for (int Index = 0; Index < (Elems >> Op.Size); Index++)
            {
                Context.EmitLdc_I8(Op.Imm);

                EmitVectorInsert(Context, Op.Rd, Index, Op.Size + 2);
            }

            if (Op.RegisterSize == ARegisterSize.SIMD64)
            {
                EmitVectorZeroUpper(Context, Op.Rd);
            }
        }

        public static void Ins_Gp(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            Context.EmitLdintzr(Op.Rn);

            EmitVectorInsert(Context, Op.Rd, Op.DstIndex, Op.Size);
        }

        public static void Ins_V(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            EmitVectorExtractZx(Context, Op.Rn, Op.SrcIndex, Op.Size);

            EmitVectorInsert(Context, Op.Rd, Op.DstIndex, Op.Size);
        }

        public static void Movi_V(AILEmitterCtx Context)
        {
            EmitVectorImmUnaryOp(Context, () => { });
        }

        public static void Mvni_V(AILEmitterCtx Context)
        {
            EmitVectorImmUnaryOp(Context, () => Context.Emit(OpCodes.Not));
        }

        public static void Smov_S(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            EmitVectorExtractSx(Context, Op.Rn, Op.DstIndex, Op.Size);

            EmitIntZeroUpperIfNeeded(Context);

            Context.EmitStintzr(Op.Rd);
        }

        public static void Tbl_V(AILEmitterCtx Context)
        {
            AOpCodeSimdTbl Op = (AOpCodeSimdTbl)Context.CurrOp;

            Context.EmitLdvec(Op.Rm);

            for (int Index = 0; Index < Op.Size; Index++)
            {
                Context.EmitLdvec((Op.Rn + Index) & 0x1f);
            }

            switch (Op.Size)
            {
                case 1: AVectorHelper.EmitCall(Context,
                    nameof(AVectorHelper.Tbl1_V64),
                    nameof(AVectorHelper.Tbl1_V128)); break;

                case 2: AVectorHelper.EmitCall(Context,
                    nameof(AVectorHelper.Tbl2_V64),
                    nameof(AVectorHelper.Tbl2_V128)); break;

                case 3: AVectorHelper.EmitCall(Context,
                    nameof(AVectorHelper.Tbl3_V64),
                    nameof(AVectorHelper.Tbl3_V128)); break;

                case 4: AVectorHelper.EmitCall(Context,
                    nameof(AVectorHelper.Tbl4_V64),
                    nameof(AVectorHelper.Tbl4_V128)); break;

                default: throw new InvalidOperationException();
            }

            Context.EmitStvec(Op.Rd);
        }

        public static void Trn1_V(AILEmitterCtx Context)
        {
            EmitVectorTranspose(Context, Part: 0);
        }

        public static void Trn2_V(AILEmitterCtx Context)
        {
            EmitVectorTranspose(Context, Part: 1);
        }

        public static void Umov_S(AILEmitterCtx Context)
        {
            AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;

            EmitVectorExtractZx(Context, Op.Rn, Op.DstIndex, Op.Size);

            Context.EmitStintzr(Op.Rd);
        }

        public static void Uzp1_V(AILEmitterCtx Context)
        {
            EmitVectorUnzip(Context, Part: 0);
        }

        public static void Uzp2_V(AILEmitterCtx Context)
        {
            EmitVectorUnzip(Context, Part: 1);
        }

        public static void Xtn_V(AILEmitterCtx Context)
        {
            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;

            int Elems = 8 >> Op.Size;

            int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0;

            if (AOptimizations.UseSse41 && Op.Size < 2)
            {
                void EmitZeroVector()
                {
                    switch (Op.Size)
                    {
                        case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16Zero)); break;
                        case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32Zero)); break;
                    }
                }

                //For XTN, first operand is source, second operand is 0.
                //For XTN2, first operand is 0, second operand is source.
                if (Part != 0)
                {
                    EmitZeroVector();
                }

                EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size + 1);

                //Set mask to discard the upper half of the wide elements.
                switch (Op.Size)
                {
                    case 0: Context.EmitLdc_I4(0x00ff);     break;
                    case 1: Context.EmitLdc_I4(0x0000ffff); break;
                }

                Type WideType = IntTypesPerSizeLog2[Op.Size + 1];

                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { WideType }));

                WideType = VectorIntTypesPerSizeLog2[Op.Size + 1];

                Type[] WideTypes = new Type[] { WideType, WideType };

                Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), WideTypes));

                if (Part == 0)
                {
                    EmitZeroVector();
                }

                //Pack values with signed saturation, the signed saturation shouldn't
                //saturate anything since the upper bits were masked off.
                Type SseType = Op.Size == 0 ? typeof(Sse2) : typeof(Sse41);

                Context.EmitCall(SseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), WideTypes));

                if (Part != 0)
                {
                    //For XTN2, we additionally need to discard the upper bits
                    //of the target register and OR the result with it.
                    EmitVectorZeroUpper(Context, Op.Rd);

                    EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);

                    Type NarrowType = VectorUIntTypesPerSizeLog2[Op.Size];

                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { NarrowType, NarrowType }));
                }

                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
            }
            else
            {
                if (Part != 0)
                {
                    Context.EmitLdvec(Op.Rd);
                    Context.EmitStvectmp();
                }

                for (int Index = 0; Index < Elems; Index++)
                {
                    EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1);

                    EmitVectorInsertTmp(Context, Part + Index, Op.Size);
                }

                Context.EmitLdvectmp();
                Context.EmitStvec(Op.Rd);

                if (Part == 0)
                {
                    EmitVectorZeroUpper(Context, Op.Rd);
                }
            }
        }

        public static void Zip1_V(AILEmitterCtx Context)
        {
            EmitVectorZip(Context, Part: 0);
        }

        public static void Zip2_V(AILEmitterCtx Context)
        {
            EmitVectorZip(Context, Part: 1);
        }

        private static void EmitIntZeroUpperIfNeeded(AILEmitterCtx Context)
        {
            if (Context.CurrOp.RegisterSize == ARegisterSize.Int32 ||
                Context.CurrOp.RegisterSize == ARegisterSize.SIMD64)
            {
                Context.Emit(OpCodes.Conv_U4);
                Context.Emit(OpCodes.Conv_U8);
            }
        }

        private static void EmitVectorTranspose(AILEmitterCtx Context, int Part)
        {
            AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;

            int Words = Op.GetBitsCount() >> 4;
            int Pairs = Words >> Op.Size;

            for (int Index = 0; Index < Pairs; Index++)
            {
                int Idx = Index << 1;

                EmitVectorExtractZx(Context, Op.Rn, Idx + Part, Op.Size);
                EmitVectorExtractZx(Context, Op.Rm, Idx + Part, Op.Size);

                EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
                EmitVectorInsertTmp(Context, Idx,     Op.Size);
            }

            Context.EmitLdvectmp();
            Context.EmitStvec(Op.Rd);

            if (Op.RegisterSize == ARegisterSize.SIMD64)
            {
                EmitVectorZeroUpper(Context, Op.Rd);
            }
        }

        private static void EmitVectorUnzip(AILEmitterCtx Context, int Part)
        {
            AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;

            int Words = Op.GetBitsCount() >> 4;
            int Pairs = Words >> Op.Size;

            for (int Index = 0; Index < Pairs; Index++)
            {
                int Idx = Index << 1;

                EmitVectorExtractZx(Context, Op.Rn, Idx + Part, Op.Size);
                EmitVectorExtractZx(Context, Op.Rm, Idx + Part, Op.Size);

                EmitVectorInsertTmp(Context, Pairs + Index, Op.Size);
                EmitVectorInsertTmp(Context,         Index, Op.Size);
            }

            Context.EmitLdvectmp();
            Context.EmitStvec(Op.Rd);

            if (Op.RegisterSize == ARegisterSize.SIMD64)
            {
                EmitVectorZeroUpper(Context, Op.Rd);
            }
        }

        private static void EmitVectorZip(AILEmitterCtx Context, int Part)
        {
            AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;

            if (AOptimizations.UseSse2)
            {
                EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
                EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);

                Type[] Types = new Type[]
                {
                    VectorUIntTypesPerSizeLog2[Op.Size],
                    VectorUIntTypesPerSizeLog2[Op.Size]
                };

                string Name = Part == 0 || (Part != 0 && Op.RegisterSize == ARegisterSize.SIMD64)
                    ? nameof(Sse2.UnpackLow)
                    : nameof(Sse2.UnpackHigh);

                Context.EmitCall(typeof(Sse2).GetMethod(Name, Types));

                if (Op.RegisterSize == ARegisterSize.SIMD64 && Part != 0)
                {
                    Context.EmitLdc_I4(8);

                    Type[] ShTypes = new Type[] { VectorUIntTypesPerSizeLog2[Op.Size], typeof(byte) };

                    Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), ShTypes));
                }

                EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);

                if (Op.RegisterSize == ARegisterSize.SIMD64 && Part == 0)
                {
                    EmitVectorZeroUpper(Context, Op.Rd);
                }
            }
            else
            {
                int Words = Op.GetBitsCount() >> 4;
                int Pairs = Words >> Op.Size;

                int Base = Part != 0 ? Pairs : 0;

                for (int Index = 0; Index < Pairs; Index++)
                {
                    int Idx = Index << 1;

                    EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size);
                    EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size);

                    EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
                    EmitVectorInsertTmp(Context, Idx,     Op.Size);
                }

                Context.EmitLdvectmp();
                Context.EmitStvec(Op.Rd);

                if (Op.RegisterSize == ARegisterSize.SIMD64)
                {
                    EmitVectorZeroUpper(Context, Op.Rd);
                }
            }
        }
    }
}