diff --git a/src/jit/Backend.cpp b/src/jit/Backend.cpp index f584d585c..13e47d626 100644 --- a/src/jit/Backend.cpp +++ b/src/jit/Backend.cpp @@ -397,7 +397,7 @@ static void emitSelect128(sljit_compiler*, Instruction*, sljit_s32); static void emitMove(sljit_compiler*, uint32_t type, Operand* from, Operand* to); static ByteCodeStackOffset* emitStoreOntoStack(sljit_compiler* compiler, Operand* param, ByteCodeStackOffset* stackOffset, const ValueTypeVector& types, bool isWordOffsets); -#if (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) || (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) +#if (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) || (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) || (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV && defined __riscv_vector) #define HAS_SIMD #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) @@ -433,7 +433,7 @@ static void simdOperandToArg(sljit_compiler* compiler, Operand* operand, JITArg& arg.argw = 0; } -#endif /* SLJIT_CONFIG_ARM */ +#endif /* SLJIT_CONFIG_ARM || SLJIT_CONFIG_X86 || SLJIT_CONFIG_RISCV */ #include "FloatMathInl.h" @@ -456,6 +456,8 @@ static void simdOperandToArg(sljit_compiler* compiler, Operand* operand, JITArg& #include "SimdArm64Inl.h" #elif (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) #include "SimdArm32Inl.h" +#elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV && defined __riscv_vector) +#include "SimdRiscvInl.h" #endif /* SLJIT_CONFIG_ARM */ #ifdef HAS_SIMD @@ -1028,6 +1030,9 @@ JITCompiler::JITCompiler(Module* module, uint32_t JITFlags) , m_options(0) , m_savedIntegerRegCount(0) , m_savedFloatRegCount(0) +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + , m_savedVectorRegCount(0) +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ , m_stackTmpSize(0) { if (module->m_jitModule != nullptr) { @@ -1530,9 +1535,13 @@ void JITCompiler::emitProlog() ASSERT(m_stackTmpSize <= 16); #endif /* SLJIT_CONFIG_ARM_32 */ - sljit_emit_enter(m_compiler, options, SLJIT_ARGS1(P, P_R), - SLJIT_NUMBER_OF_SCRATCH_REGISTERS | SLJIT_ENTER_FLOAT(SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS), - (m_savedIntegerRegCount + 2) | SLJIT_ENTER_FLOAT(m_savedFloatRegCount), m_context.stackTmpStart + m_stackTmpSize); + sljit_s32 scratches = SLJIT_NUMBER_OF_SCRATCH_REGISTERS | SLJIT_ENTER_FLOAT(SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS) | SLJIT_ENTER_VECTOR(SLJIT_NUMBER_OF_SCRATCH_VECTOR_REGISTERS); +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + sljit_s32 saveds = (m_savedIntegerRegCount + 2) | SLJIT_ENTER_FLOAT(m_savedFloatRegCount) | SLJIT_ENTER_VECTOR(m_savedVectorRegCount); +#else /* !SLJIT_SEPARATE_VECTOR_REGISTERS */ + sljit_s32 saveds = (m_savedIntegerRegCount + 2) | SLJIT_ENTER_FLOAT(m_savedFloatRegCount) | SLJIT_ENTER_VECTOR(m_savedFloatRegCount); +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ + sljit_emit_enter(m_compiler, options, SLJIT_ARGS1(P, P_R), scratches, saveds, m_context.stackTmpStart + m_stackTmpSize); sljit_emit_op1(m_compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), kContextOffset, SLJIT_R0, 0); diff --git a/src/jit/ByteCodeParser.cpp b/src/jit/ByteCodeParser.cpp index 10673c52d..a77364f63 100644 --- a/src/jit/ByteCodeParser.cpp +++ b/src/jit/ByteCodeParser.cpp @@ -143,66 +143,66 @@ static bool isFloatGlobal(uint32_t globalIndex, Module* module) // [TEMPORARY_TYPE] | S0 | S2 : A temporary register is required, which can // be the same as the first or third source operands -#define OPERAND_TYPE_LIST \ - OL2(OTOp1I32, /* SD */ I32, I32 | S0) \ - OL3(OTOp2I32, /* SSD */ I32, I32, 
I32 | S0 | S1) \ - OL2(OTOp1I64, /* SD */ I64, I64 | S0) \ - OL2(OTOp1F32, /* SD */ F32, F32 | S0) \ - OL2(OTOp1F64, /* SD */ F64, F64 | S0) \ - OL3(OTOp2F32, /* SSD */ F32, F32, F32 | S0 | S1) \ - OL3(OTOp2F64, /* SSD */ F64, F64, F64 | S0 | S1) \ - OL1(OTGetI32, /* S */ I32) \ - OL1(OTPutI32, /* D */ I32) \ - OL1(OTPutI64, /* D */ I64) \ - OL1(OTPutV128, /* D */ V128) \ - OL1(OTPutPTR, /* D */ PTR) \ - OL2(OTMoveF32, /* SD */ F32 | NOTMP, F32 | S0) \ - OL2(OTMoveF64, /* SD */ F64 | NOTMP, F64 | S0) \ - OL2(OTMoveV128, /* SD */ V128, V128 | S0) \ - OL2(OTI32ReinterpretF32, /* SD */ F32, I32) \ - OL2(OTI64ReinterpretF64, /* SD */ F64, I64) \ - OL2(OTF32ReinterpretI32, /* SD */ I32, F32) \ - OL2(OTF64ReinterpretI64, /* SD */ I64, F64) \ - OL2(OTEqzI64, /* SD */ I64, I32) \ - OL3(OTCompareI64, /* SSD */ I64, I64, I32) \ - OL3(OTCompareF32, /* SSD */ F32, F32, I32) \ - OL3(OTCompareF64, /* SSD */ F64, F64, I32) \ - OL3(OTCopySignF32, /* SSD */ F32, F32, F32 | TMP | S0 | S1) \ - OL3(OTCopySignF64, /* SSD */ F64, F64, F64 | TMP | S0 | S1) \ - OL2(OTDemoteF64, /* SD */ F64, F32 | S0) \ - OL2(OTPromoteF32, /* SD */ F32, F64 | S0) \ - OL4(OTLoadI32, /* SDTT */ I32, I32 | S0, PTR, I32 | S0) \ - OL4(OTLoadF32, /* SDTT */ I32, F32, PTR, I32 | S0) \ - OL4(OTLoadF64, /* SDTT */ I32, F64, PTR, I32 | S0) \ - OL4(OTLoadV128, /* SDTT */ I32, V128 | TMP, PTR, I32 | S0) \ - OL5(OTLoadLaneV128, /* SSDTTT */ I32, V128 | NOTMP, V128 | TMP | S1, PTR, I32 | S0) \ - OL5(OTStoreI32, /* SSTTT */ I32, I32, PTR, I32 | S0, I32 | S1) \ - OL4(OTStoreF32, /* SSTT */ I32, F32 | NOTMP, PTR, I32 | S0) \ - OL5(OTStoreI64, /* SSTTT */ I32, I64, PTR, I32 | S0, PTR | S1) \ - OL4(OTStoreF64, /* SSTT */ I32, F64 | NOTMP, PTR, I32 | S0) \ - OL4(OTStoreV128, /* SSTT */ I32, V128 | TMP, PTR, I32 | S0) \ - OL3(OTCallback3Arg, /* SSS */ I32, I32, I32) \ - OL3(OTTableGrow, /* SSD */ I32, PTR, I32 | S0 | S1) \ - OL4(OTTableSet, /* SSTT */ I32, PTR, I32 | S0, PTR) \ - OL3(OTTableGet, /* SDT */ I32, PTR | TMP | S0, I32) \ - OL1(OTGlobalGetF32, /* D */ F32) \ - OL1(OTGlobalGetF64, /* D */ F64) \ - OL2(OTGlobalSetI32, /* ST */ I32, PTR) \ - OL2(OTGlobalSetI64, /* ST */ I64, PTR) \ - OL1(OTGlobalSetF32, /* S */ F32 | NOTMP) \ - OL1(OTGlobalSetF64, /* S */ F64 | NOTMP) \ - OL2(OTConvertInt32FromInt64, /* SD */ I64, I32) \ - OL2(OTConvertInt64FromInt32, /* SD */ I32, I64) \ - OL2(OTConvertInt32FromFloat32, /* SD */ F32 | TMP, I32 | TMP) \ - OL2(OTConvertInt32FromFloat64, /* SD */ F64 | TMP, I32 | TMP) \ - OL2(OTConvertInt64FromFloat32Callback, /* SD */ F32, I64) \ - OL2(OTConvertInt64FromFloat64Callback, /* SD */ F64, I64) \ - OL2(OTConvertFloat32FromInt32, /* SD */ I32, F32) \ - OL2(OTConvertFloat64FromInt32, /* SD */ I32, F64) \ - OL2(OTConvertFloat32FromInt64, /* SD */ I64, F32) \ - OL2(OTConvertFloat64FromInt64, /* SD */ I64, F64) \ - OL4(OTSelectI32, /* SSSD */ I32, I32, I32, I32 | S0 | S1) \ - OL4(OTSelectF32, /* SSSD */ F32, F32, I32, F32 | S0 | S1) \ +#define OPERAND_TYPE_LIST \ + OL2(OTOp1I32, /* SD */ I32, I32 | S0) \ + OL3(OTOp2I32, /* SSD */ I32, I32, I32 | S0 | S1) \ + OL2(OTOp1I64, /* SD */ I64, I64 | S0) \ + OL2(OTOp1F32, /* SD */ F32, F32 | S0) \ + OL2(OTOp1F64, /* SD */ F64, F64 | S0) \ + OL3(OTOp2F32, /* SSD */ F32, F32, F32 | S0 | S1) \ + OL3(OTOp2F64, /* SSD */ F64, F64, F64 | S0 | S1) \ + OL1(OTGetI32, /* S */ I32) \ + OL1(OTPutI32, /* D */ I32) \ + OL1(OTPutI64, /* D */ I64) \ + OL1(OTPutV128, /* D */ V128) \ + OL1(OTPutPTR, /* D */ PTR) \ + OL2(OTMoveF32, /* SD */ F32 | NOTMP, F32 | S0) \ + OL2(OTMoveF64, /* SD */ F64 | 
NOTMP, F64 | S0) \ + OL2(OTMoveV128, /* SD */ V128, V128 | S0) \ + OL2(OTI32ReinterpretF32, /* SD */ F32, I32) \ + OL2(OTI64ReinterpretF64, /* SD */ F64, I64) \ + OL2(OTF32ReinterpretI32, /* SD */ I32, F32) \ + OL2(OTF64ReinterpretI64, /* SD */ I64, F64) \ + OL2(OTEqzI64, /* SD */ I64, I32) \ + OL3(OTCompareI64, /* SSD */ I64, I64, I32) \ + OL3(OTCompareF32, /* SSD */ F32, F32, I32) \ + OL3(OTCompareF64, /* SSD */ F64, F64, I32) \ + OL3(OTCopySignF32, /* SSD */ F32, F32, F32 | TMP | S0 | S1) \ + OL3(OTCopySignF64, /* SSD */ F64, F64, F64 | TMP | S0 | S1) \ + OL2(OTDemoteF64, /* SD */ F64, F32 | S0) \ + OL2(OTPromoteF32, /* SD */ F32, F64 | S0) \ + OL4(OTLoadI32, /* SDTT */ I32, I32 | S0, PTR, I32 | S0) \ + OL4(OTLoadF32, /* SDTT */ I32, F32, PTR, I32 | S0) \ + OL4(OTLoadF64, /* SDTT */ I32, F64, PTR, I32 | S0) \ + OL4(OTLoadV128, /* SDTT */ I32, V128 | TMP, PTR, I32 | S0) \ + OL5(OTLoadLaneV128, /* SSDTT */ I32, V128 | NOTMP, V128 | TMP | S1, PTR, I32 | S0) \ + OL5(OTStoreI32, /* SSTTT */ I32, I32, PTR, I32 | S0, I32 | S1) \ + OL4(OTStoreF32, /* SSTT */ I32, F32 | NOTMP, PTR, I32 | S0) \ + OL5(OTStoreI64, /* SSTTT */ I32, I64, PTR, I32 | S0, PTR | S1) \ + OL4(OTStoreF64, /* SSTT */ I32, F64 | NOTMP, PTR, I32 | S0) \ + OL4(OTStoreV128, /* SSTT */ I32, V128 | TMP, PTR, I32 | S0) \ + OL3(OTCallback3Arg, /* SSS */ I32, I32, I32) \ + OL3(OTTableGrow, /* SSD */ I32, PTR, I32 | S0 | S1) \ + OL4(OTTableSet, /* SSTT */ I32, PTR, I32 | S0, PTR) \ + OL3(OTTableGet, /* SDT */ I32, PTR | TMP | S0, I32) \ + OL1(OTGlobalGetF32, /* D */ F32) \ + OL1(OTGlobalGetF64, /* D */ F64) \ + OL2(OTGlobalSetI32, /* ST */ I32, PTR) \ + OL2(OTGlobalSetI64, /* ST */ I64, PTR) \ + OL1(OTGlobalSetF32, /* S */ F32 | NOTMP) \ + OL1(OTGlobalSetF64, /* S */ F64 | NOTMP) \ + OL2(OTConvertInt32FromInt64, /* SD */ I64, I32) \ + OL2(OTConvertInt64FromInt32, /* SD */ I32, I64) \ + OL2(OTConvertInt32FromFloat32, /* SD */ F32 | TMP, I32 | TMP) \ + OL2(OTConvertInt32FromFloat64, /* SD */ F64 | TMP, I32 | TMP) \ + OL2(OTConvertInt64FromFloat32Callback, /* SD */ F32, I64) \ + OL2(OTConvertInt64FromFloat64Callback, /* SD */ F64, I64) \ + OL2(OTConvertFloat32FromInt32, /* SD */ I32, F32) \ + OL2(OTConvertFloat64FromInt32, /* SD */ I32, F64) \ + OL2(OTConvertFloat32FromInt64, /* SD */ I64, F32) \ + OL2(OTConvertFloat64FromInt64, /* SD */ I64, F64) \ + OL4(OTSelectI32, /* SSSD */ I32, I32, I32, I32 | S0 | S1) \ + OL4(OTSelectF32, /* SSSD */ F32, F32, I32, F32 | S0 | S1) \ OL4(OTSelectF64, /* SSSD */ F64, F64, I32, F64 | S0 | S1) #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE) @@ -322,6 +322,24 @@ static bool isFloatGlobal(uint32_t globalIndex, Module* module) #define OTShiftV128Tmp OTShiftV128 #define OTOp3DotAddV128 OTOp3V128 +#elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) +#define OPERAND_TYPE_LIST_SIMD_ARCH \ + OL2(OTOp1V128CB, /* SD */ V128 | NOTMP, V128 | NOTMP) \ + OL3(OTOp2V128, /* SSD */ V128 | TMP, V128 | TMP, V128 | TMP | S0 | S1) \ + OL3(OTOp1V128Tmp, /* SDT */ V128 | NOTMP, V128 | TMP | S0, V128) \ + OL3(OTSwizzleV128, /* SSD */ V128 | TMP, V128 | NOTMP, V128 | TMP | S1) \ + OL3(OTShuffleV128, /* SSD */ V128 | TMP, V128 | TMP, V128 | TMP) \ + OL3(OTShiftV128, /* SSD */ V128 | NOTMP, I32, V128 | TMP | S0) + +// List of aliases. 
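+// Each alias below reuses a generic descriptor defined above: on RISC-V the
+// reversed, temporary, and min/max variants appear to need no operand
+// constraints beyond the plain two-operand forms.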
+#define OTOp2V128Rev OTOp2V128
+#define OTOp2V128Tmp OTOp2V128
+#define OTMinMaxV128 OTOp2V128
+#define OTPMinMaxV128 OTOp2V128
+#define OTPopcntV128 OTOp1V128Tmp
+#define OTShiftV128Tmp OTShiftV128
+#define OTOp3DotAddV128 OTOp3V128
+
 #endif /* SLJIT_CONFIG_ARM */
 
 // Constructing read-only operand descriptors.
diff --git a/src/jit/Compiler.h b/src/jit/Compiler.h
index a97d45ac7..5f8dfea02 100644
--- a/src/jit/Compiler.h
+++ b/src/jit/Compiler.h
@@ -832,6 +832,9 @@ class JITCompiler {
     uint32_t m_options;
     uint8_t m_savedIntegerRegCount;
     uint8_t m_savedFloatRegCount;
+#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS)
+    uint8_t m_savedVectorRegCount;
+#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */
     uint8_t m_stackTmpSize;
 
     std::vector<TryBlock> m_tryBlocks;
diff --git a/src/jit/InstList.cpp b/src/jit/InstList.cpp
index b52100d79..7311f61ee 100644
--- a/src/jit/InstList.cpp
+++ b/src/jit/InstList.cpp
@@ -425,8 +425,14 @@ void JITCompiler::dump()
             prefix = "F";
             savedStart = SLJIT_FR(SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS);
             savedEnd = SLJIT_FS0;
+#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS)
+            if ((variable.info & Instruction::TypeMask) == Instruction::V128Operand) {
+                prefix = "V";
+                savedStart = SLJIT_VR(SLJIT_NUMBER_OF_SCRATCH_VECTOR_REGISTERS);
+                savedEnd = SLJIT_VS1;
+            }
+#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */
         }
-
         uint32_t reg1 = static_cast<uint32_t>(VARIABLE_GET_REF(variable.value));
 
 #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE)
diff --git a/src/jit/RegisterAlloc.cpp b/src/jit/RegisterAlloc.cpp
index 151e2eb8a..f562d850b 100644
--- a/src/jit/RegisterAlloc.cpp
+++ b/src/jit/RegisterAlloc.cpp
@@ -20,6 +20,25 @@
 #include "jit/Compiler.h"
 #include <set>
 
+#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS)
+#define VECTOR_SELECT(COND, VECTOR, FLOAT) \
+    if (COND) {                            \
+        VECTOR;                            \
+    } else {                               \
+        FLOAT;                             \
+    }
+#else /* !SLJIT_SEPARATE_VECTOR_REGISTERS */
+#define VECTOR_SELECT(COND, VECTOR, FLOAT) FLOAT;
+#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */
+
+#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS)
+#if (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
+#define FIRST_VECTOR_REG SLJIT_VR1
+#else /* !SLJIT_CONFIG_RISCV */
+#define FIRST_VECTOR_REG SLJIT_VR0
+#endif /* SLJIT_CONFIG_RISCV */
+#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */
+
 namespace Walrus {
 
 class RegisterSet {
@@ -36,8 +55,8 @@ class RegisterSet {
 
     uint8_t getSavedRegCount() { return static_cast<uint8_t>(m_usedSavedRegisters - m_savedStartIndex); }
 
-    uint8_t toCPUReg(uint8_t reg);
-    bool check(int8_t reg, uint16_t constraints);
+    uint8_t toCPUReg(uint8_t reg, uint8_t scratchBase, uint8_t savedBase);
+    bool check(uint8_t reg, uint16_t constraints);
     void freeUnusedRegisters(size_t id);
     uint8_t allocateRegister(VariableList::Variable* variable);
 #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE)
@@ -87,26 +106,50 @@ class RegisterSet {
 
 class RegisterFile {
 public:
-    RegisterFile(uint32_t numberOfIntegerScratchRegs, uint32_t numberOfIntegerSavedRegs,
-                 uint32_t numberOfFloatScratchRegs, uint32_t numberOfFloatSavedRegs)
+    RegisterFile(uint32_t numberOfIntegerScratchRegs, uint32_t numberOfIntegerSavedRegs)
         : m_integerSet(numberOfIntegerScratchRegs, numberOfIntegerSavedRegs, true)
-        , m_floatSet(numberOfFloatScratchRegs, numberOfFloatSavedRegs, false)
+        , m_floatSet(SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS, SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS, false)
+#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && 
SLJIT_SEPARATE_VECTOR_REGISTERS) +#if (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) + , m_vectorSet(SLJIT_NUMBER_OF_SCRATCH_VECTOR_REGISTERS - 1, SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS, false) +#else /* !SLJIT_CONFIG_RISCV */ + , m_vectorSet(SLJIT_NUMBER_OF_SCRATCH_VECTOR_REGISTERS, SLJIT_NUMBER_OF_SAVED_VECTOR_REGISTERS, false) +#endif /* SLJIT_CONFIG_RISCV */ +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ { } - RegisterSet& integerSet() { return m_integerSet; } - RegisterSet& floatSet() { return m_floatSet; } - + RegisterSet& integerSet() + { + return m_integerSet; + } + RegisterSet& floatSet() + { + return m_floatSet; + } +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + RegisterSet& vectorSet() + { + return m_vectorSet; + } +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ uint8_t toCPUIntegerReg(uint8_t reg) { - return m_integerSet.toCPUReg(reg); + return m_integerSet.toCPUReg(reg, SLJIT_R0, SLJIT_S2); } uint8_t toCPUFloatReg(uint8_t reg) { - return m_floatSet.toCPUReg(reg); + return m_floatSet.toCPUReg(reg, SLJIT_FR0, SLJIT_FS0); } +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + uint8_t toCPUVectorReg(uint8_t reg) + { + return m_vectorSet.toCPUReg(reg, FIRST_VECTOR_REG, SLJIT_VS0); + } +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ + void integerReserve(uint8_t reg) { m_integerSet.reserve(reg); @@ -117,6 +160,13 @@ class RegisterFile { m_floatSet.reserve(reg); } +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + void vectorReserve(uint8_t reg) + { + m_vectorSet.reserve(reg); + } +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ + void allocateVariable(VariableList::Variable* variable) { uint8_t type = variable->info & Instruction::TypeMask; @@ -124,11 +174,17 @@ class RegisterFile { if (type & Instruction::FloatOperandMarker) { #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) - if ((variable->info & Instruction::TypeMask) == Instruction::V128Operand) { + if (type == Instruction::V128Operand) { m_floatSet.allocateQuadRegister(variable); return; } #endif /* SLJIT_CONFIG_ARM_32 */ +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + if (type == Instruction::V128Operand) { + m_vectorSet.allocateRegister(variable); + return; + } +#endif m_floatSet.allocateRegister(variable); return; } @@ -147,6 +203,9 @@ class RegisterFile { { m_integerSet.freeUnusedRegisters(id); m_floatSet.freeUnusedRegisters(id); +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + m_vectorSet.freeUnusedRegisters(id); +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ } bool reuseResult(uint8_t type, VariableList::Variable** reusableRegs, VariableList::Variable* resultVariable); @@ -154,6 +213,9 @@ class RegisterFile { private: RegisterSet m_integerSet; RegisterSet m_floatSet; +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + RegisterSet m_vectorSet; +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ }; RegisterSet::RegisterSet(uint32_t numberOfScratchRegs, uint32_t numberOfSavedRegs, bool isInteger) @@ -164,17 +226,16 @@ RegisterSet::RegisterSet(uint32_t numberOfScratchRegs, uint32_t numberOfSavedReg m_registers.resize(numberOfScratchRegs + numberOfSavedRegs); } -uint8_t RegisterSet::toCPUReg(uint8_t reg) +uint8_t RegisterSet::toCPUReg(uint8_t reg, uint8_t scratchBase, uint8_t savedBase) { if (reg < m_savedStartIndex) { - return SLJIT_R0 + reg; + return scratchBase + reg; } - uint8_t base = (m_regStatus & kIsInteger) ? 
SLJIT_S2 : SLJIT_FS0; - return base - (reg - m_savedStartIndex); + return savedBase - (reg - m_savedStartIndex); } -bool RegisterSet::check(int8_t reg, uint16_t constraints) +bool RegisterSet::check(uint8_t reg, uint16_t constraints) { if (constraints & VariableList::kIsCallback) { return reg >= m_savedStartIndex; @@ -528,16 +589,21 @@ bool RegisterFile::reuseResult(uint8_t type, VariableList::Variable** reusableRe } uint16_t constraints = resultVariable->info; - RegisterSet& registers = (type & Instruction::FloatOperandMarker) ? m_floatSet : m_integerSet; + RegisterSet* registers = (type & Instruction::FloatOperandMarker) ? &m_floatSet : &m_integerSet; +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + if ((type & Instruction::TypeMask) == Instruction::V128Operand) { + registers = &m_vectorSet; + } +#endif #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE) bool isInt64 = (type & Instruction::TypeMask) == Instruction::Int64Operand; #endif /* SLJIT_32BIT_ARCHITECTURE */ for (uint32_t i = 0; i < 3; i++) { VariableList::Variable* variable = reusableRegs[i]; - if ((type & (Instruction::Src0Allowed << i)) && variable != nullptr && registers.check(variable->reg1, constraints)) { + if ((type & (Instruction::Src0Allowed << i)) && variable != nullptr && registers->check(variable->reg1, constraints)) { #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE) - if (isInt64 && !registers.check(variable->reg2, constraints)) { + if (isInt64 && !registers->check(variable->reg2, constraints)) { continue; } #endif /* SLJIT_32BIT_ARCHITECTURE */ @@ -545,11 +611,11 @@ bool RegisterFile::reuseResult(uint8_t type, VariableList::Variable** reusableRe reusableRegs[i] = nullptr; resultVariable->reg1 = variable->reg1; - registers.updateVariable(resultVariable->reg1, resultVariable); + registers->updateVariable(resultVariable->reg1, resultVariable); #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE) resultVariable->reg2 = variable->reg2; - registers.updateVariable(resultVariable->reg2, resultVariable); + registers->updateVariable(resultVariable->reg2, resultVariable); #endif /* SLJIT_32BIT_ARCHITECTURE */ return true; } @@ -563,6 +629,9 @@ void JITCompiler::allocateRegisters() if (m_variableList == nullptr) { m_savedIntegerRegCount = 0; m_savedFloatRegCount = 0; +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + m_savedVectorRegCount = 0; +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ return; } @@ -574,8 +643,7 @@ void JITCompiler::allocateRegisters() const uint32_t numberOfsavedRegs = SLJIT_NUMBER_OF_SAVED_REGISTERS - 2; #endif /* SLJIT_CONFIG_X86_32 */ - RegisterFile regs(numberOfscratchRegs, numberOfsavedRegs, - SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS, SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS); + RegisterFile regs(numberOfscratchRegs, numberOfsavedRegs); size_t variableListParamCount = m_variableList->paramCount; for (size_t i = 0; i < variableListParamCount; i++) { @@ -672,7 +740,7 @@ void JITCompiler::allocateRegisters() if (reg != VariableList::kUnusedReg) { ASSERT(!(variable->info & VariableList::kIsImmediate)); - regs.floatReserve(reg); + VECTOR_SELECT((*list & Instruction::TypeMask) == Instruction::V128Operand, regs.vectorReserve(reg), regs.floatReserve(reg)) #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) if ((*list & Instruction::TypeMask) != Instruction::V128Operand) { regs.floatReserve(reg + 1); @@ -684,12 +752,12 @@ void JITCompiler::allocateRegisters() reg = 
regs.floatSet().allocateQuadRegister(nullptr); } else { #endif /* SLJIT_CONFIG_ARM_32 */ - reg = regs.floatSet().allocateRegister(nullptr); + VECTOR_SELECT((*list & Instruction::TypeMask) == Instruction::V128Operand, reg = regs.vectorSet().allocateRegister(nullptr), reg = regs.floatSet().allocateRegister(nullptr)) #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) } #endif /* SLJIT_CONFIG_ARM_32 */ } - instr->setRequiredReg(tmpIndex, regs.toCPUFloatReg(reg)); + VECTOR_SELECT((*list & Instruction::TypeMask) == Instruction::V128Operand, instr->setRequiredReg(tmpIndex, regs.toCPUVectorReg(reg)), instr->setRequiredReg(tmpIndex, regs.toCPUFloatReg(reg))) } tmpIndex++; } @@ -717,7 +785,7 @@ void JITCompiler::allocateRegisters() if (type & Instruction::FloatOperandMarker) { if (resultVariable->reg1 != VariableList::kUnusedReg) { - regs.floatReserve(resultVariable->reg1); + VECTOR_SELECT(type == Instruction::V128Operand, regs.vectorReserve(resultVariable->reg1), regs.floatReserve(resultVariable->reg1)) #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) regs.floatReserve(resultVariable->reg2); #endif /* SLJIT_CONFIG_ARM_32 */ @@ -739,11 +807,11 @@ void JITCompiler::allocateRegisters() resultReg = regs.floatSet().allocateQuadRegister(nullptr); } else { #endif /* SLJIT_CONFIG_ARM_32 */ - resultReg = regs.floatSet().allocateRegister(nullptr); + VECTOR_SELECT(type == Instruction::V128Operand, resultReg = regs.vectorSet().allocateRegister(nullptr), resultReg = regs.floatSet().allocateRegister(nullptr)) #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) } #endif /* SLJIT_CONFIG_ARM_32 */ - resultReg = regs.toCPUFloatReg(resultReg); + VECTOR_SELECT(type == Instruction::V128Operand, resultReg = regs.toCPUVectorReg(resultReg), resultReg = regs.toCPUFloatReg(resultReg)) #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE) } else if (type == Instruction::Int64Operand) { uint8_t otherReg; @@ -761,8 +829,9 @@ void JITCompiler::allocateRegisters() regs.floatReserve(resultReg + 1); } #endif /* SLJIT_CONFIG_ARM_32 */ - regs.floatReserve(resultReg); - resultReg = regs.toCPUFloatReg(resultReg); + VECTOR_SELECT(type == Instruction::V128Operand, + (regs.vectorReserve(resultReg), resultReg = regs.toCPUVectorReg(resultReg)), + (regs.floatReserve(resultReg), resultReg = regs.toCPUFloatReg(resultReg))) } else { #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE) if (type == Instruction::Int64Operand) { @@ -799,9 +868,9 @@ void JITCompiler::allocateRegisters() regs.floatReserve(reg + 1); } #endif /* SLJIT_CONFIG_ARM_32 */ - - regs.floatReserve(reg); - instr->setRequiredReg(reuseTmpIndex, regs.toCPUFloatReg(reg)); + VECTOR_SELECT((*nextType & Instruction::TypeMask) == Instruction::V128Operand, + (regs.vectorReserve(reg), instr->setRequiredReg(reuseTmpIndex, regs.toCPUVectorReg(reg))), + (regs.floatReserve(reg), instr->setRequiredReg(reuseTmpIndex, regs.toCPUFloatReg(reg)))) } else { regs.integerReserve(reg); instr->setRequiredReg(reuseTmpIndex, regs.toCPUIntegerReg(reg)); @@ -819,7 +888,7 @@ void JITCompiler::allocateRegisters() if (instr->requiredReg(tmpIndex) == 0) { if (*list & Instruction::FloatOperandMarker) { - instr->setRequiredReg(tmpIndex, regs.toCPUFloatReg(regs.floatSet().allocateRegister(nullptr))); + VECTOR_SELECT((*list & Instruction::TypeMask) == Instruction::V128Operand, instr->setRequiredReg(tmpIndex, regs.toCPUVectorReg(regs.vectorSet().allocateRegister(nullptr))), instr->setRequiredReg(tmpIndex, regs.toCPUFloatReg(regs.floatSet().allocateRegister(nullptr)))) 
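+                    // Without SLJIT_SEPARATE_VECTOR_REGISTERS, VECTOR_SELECT expands to its
+                    // FLOAT argument, so V128 temporaries are still taken from the float set.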
} else { instr->setRequiredReg(tmpIndex, regs.toCPUIntegerReg(regs.integerSet().allocateRegister(nullptr))); } @@ -850,6 +919,9 @@ void JITCompiler::allocateRegisters() m_savedIntegerRegCount = regs.integerSet().getSavedRegCount(); m_savedFloatRegCount = regs.floatSet().getSavedRegCount(); +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + m_savedVectorRegCount = regs.vectorSet().getSavedRegCount(); +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ // Insert stack inits before the offsets are destroyed. insertStackInitList(nullptr, 0, variableListParamCount); @@ -867,7 +939,7 @@ void JITCompiler::allocateRegisters() uint8_t reg1; if (variable.info & Instruction::FloatOperandMarker) { - reg1 = regs.toCPUFloatReg(variable.reg1); + VECTOR_SELECT(variable.info == Instruction::V128Operand, reg1 = regs.toCPUVectorReg(variable.reg1), reg1 = regs.toCPUFloatReg(variable.reg1)) } else { reg1 = regs.toCPUIntegerReg(variable.reg1); @@ -896,6 +968,9 @@ void JITCompiler::allocateRegistersSimple() { m_savedIntegerRegCount = 0; m_savedFloatRegCount = 0; +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + m_savedVectorRegCount = 0; +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ if (m_variableList == nullptr) { return; @@ -916,6 +991,9 @@ void JITCompiler::allocateRegistersSimple() uint32_t tmpIndex = 0; uint32_t nextIntIndex = SLJIT_R0; uint32_t nextFloatIndex = SLJIT_FR0; +#if (defined SLJIT_SEPARATE_VECTOR_REGISTERS && SLJIT_SEPARATE_VECTOR_REGISTERS) + uint32_t nextVectorIndex = FIRST_VECTOR_REG; +#endif /* SLJIT_SEPARATE_VECTOR_REGISTERS */ instr->setRequiredRegsDescriptor(0); @@ -944,7 +1022,7 @@ void JITCompiler::allocateRegistersSimple() // Source registers are read-only. if ((*list & Instruction::TmpRequired) || (variable.info & VariableList::kIsImmediate)) { - instr->setRequiredReg(tmpIndex, nextFloatIndex++); + VECTOR_SELECT((*list & Instruction::TypeMask) == Instruction::V128Operand, instr->setRequiredReg(tmpIndex, nextVectorIndex++), instr->setRequiredReg(tmpIndex, nextFloatIndex++)) #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) if ((*list & Instruction::TypeMask) == Instruction::V128Operand) { // Quad registers are register pairs. @@ -980,7 +1058,7 @@ void JITCompiler::allocateRegistersSimple() } if (*list & Instruction::FloatOperandMarker) { - instr->setRequiredReg(tmpIndex, nextFloatIndex++); + VECTOR_SELECT((*list & Instruction::TypeMask) == Instruction::V128Operand, instr->setRequiredReg(tmpIndex, nextVectorIndex++), instr->setRequiredReg(tmpIndex, nextFloatIndex++)) #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) if ((*list & Instruction::TypeMask) == Instruction::V128Operand) { // Quad registers are register pairs. @@ -1057,4 +1135,5 @@ void JITCompiler::freeVariables() } // namespace Walrus +#undef VECTOR_SELECT #endif // WALRUS_ENABLE_JIT diff --git a/src/jit/SimdRiscvInl.h b/src/jit/SimdRiscvInl.h new file mode 100644 index 000000000..9ee577bf5 --- /dev/null +++ b/src/jit/SimdRiscvInl.h @@ -0,0 +1,1253 @@ +/* + * Copyright (c) 2022-present Samsung Electronics Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Only included by Backend.cpp */ + +#define OPCODE(x) (((uint32_t)x) << 26) + +namespace SimdOp { + +enum InstructionType : uint32_t { + vm = 1 << 25, + simd = 0x57 | vm, + opfvf = (0x5 << 12) | simd, + opfvv = (0x1 << 12) | simd, + opivi = (0x3 << 12) | simd, + opivv = (0x0 << 12) | simd, + opivx = (0x4 << 12) | simd, + opmvv = (0x2 << 12) | simd, + opmvx = (0x6 << 12) | simd +}; + +enum TypeOpcode : uint32_t { + vaaddu_vv = InstructionType::opmvv | OPCODE(0x8), + vadd_vi = InstructionType::opivi | OPCODE(0x0), + vadd_vv = InstructionType::opivv | OPCODE(0x0), + vand_vv = InstructionType::opivv | OPCODE(0x9), + vcompress_vm = InstructionType::opmvv | OPCODE(0x17), +#if defined(__riscv_zvbb) + vcpop_v = InstructionType::opmvv | OPCODE(0x12) | (0xE << 15), +#endif + vfadd_vf = InstructionType::opfvf | OPCODE(0x0), + vfadd_vv = InstructionType::opfvv | OPCODE(0x0), + vfcvt_f_x_v = InstructionType::opfvv | OPCODE(0x12) | (0x3 << 15), + vfcvt_x_f_v = InstructionType::opfvv | OPCODE(0x12) | (0x1 << 15), + vfcvt_rtz_x_f_v = InstructionType::opfvv | OPCODE(0x12) | (0x7 << 15), + vfdiv_vv = InstructionType::opfvv | OPCODE(0x20), + vfirst_m = InstructionType::opmvv | OPCODE(0x10) | (0x11 << 15), + vfmax_vv = InstructionType::opfvv | OPCODE(0x6), + vfmin_vv = InstructionType::opfvv | OPCODE(0x4), + vfmul_vv = InstructionType::opfvv | OPCODE(0x24), + vfsgnj_vv = InstructionType::opfvv | OPCODE(0x8), + vfsgnjn_vv = InstructionType::opfvv | OPCODE(0x9), + vfsgnjx_vv = InstructionType::opfvv | OPCODE(0xA), + vfsqrt_v = InstructionType::opfvv | OPCODE(0x13), + vfsub_vv = InstructionType::opfvv | OPCODE(0x2), + vmax_vv = InstructionType::opivv | OPCODE(0x7), + vmaxu_vv = InstructionType::opivv | OPCODE(0x6), + vmerge_vi = (InstructionType::opivi ^ InstructionType::vm) | OPCODE(0x17), + vmerge_vv = (InstructionType::opivv ^ InstructionType::vm) | OPCODE(0x17), + vmfeq_vv = InstructionType::opfvv | OPCODE(0x18), + vmfle_vv = InstructionType::opfvv | OPCODE(0x19), + vmflt_vv = InstructionType::opfvv | OPCODE(0x1B), + vmfne_vv = InstructionType::opfvv | OPCODE(0x1C), + vmin_vv = InstructionType::opivv | OPCODE(0x5), + vminu_vv = InstructionType::opivv | OPCODE(0x4), + vmseq_vv = InstructionType::opivv | OPCODE(0x18), + vmsle_vv = InstructionType::opivv | OPCODE(0x1D), + vmsleu_vv = InstructionType::opivv | OPCODE(0x1C), + vmslt_vv = InstructionType::opivv | OPCODE(0x1B), + vmslt_vx = InstructionType::opivx | OPCODE(0x1B), + vmsltu_vv = InstructionType::opivv | OPCODE(0x1A), + vmsne_vi = InstructionType::opivi | OPCODE(0x19), + vmsne_vv = InstructionType::opivv | OPCODE(0x19), + vmul_vv = InstructionType::opmvv | OPCODE(0x25), + vmv_sx = InstructionType::opmvx | OPCODE(0x10), + vmv_vi = InstructionType::opivi | OPCODE(0x17), + vmv_vv = InstructionType::opivv | OPCODE(0x17), + vmv_vx = InstructionType::opivx | OPCODE(0x17), + vmv_xs = InstructionType::opmvv | OPCODE(0x10), + vor_vv = InstructionType::opivv | OPCODE(0xA), + vredmaxu_vs = InstructionType::opmvv | OPCODE(0x6), + vredminu_vs = InstructionType::opmvv | OPCODE(0x4), + vredsum_vs = InstructionType::opmvv | OPCODE(0x0), + 
vrgather_vv = InstructionType::opivv | OPCODE(0xC), + vrsub_vi = InstructionType::opivi | OPCODE(0x3), + vsadd_vv = InstructionType::opivv | OPCODE(0x21), + vsaddu_vv = InstructionType::opivv | OPCODE(0x20), + vsext_vf2 = InstructionType::opmvv | OPCODE(0x12) | (0x7 << 15), + vslidedown_vi = InstructionType::opivi | OPCODE(0xF), + vsll_vi = InstructionType::opivi | OPCODE(0x25), + vsll_vx = InstructionType::opivx | OPCODE(0x25), + vsra_vi = InstructionType::opivi | OPCODE(0x29), + vsra_vx = InstructionType::opivx | OPCODE(0x29), + vsrl_vi = InstructionType::opivi | OPCODE(0x28), + vsrl_vx = InstructionType::opivx | OPCODE(0x28), + vssub_vv = InstructionType::opivv | OPCODE(0x23), + vssubu_vv = InstructionType::opivv | OPCODE(0x22), + vsub_vv = InstructionType::opivv | OPCODE(0x2), + vwmul_vv = InstructionType::opmvv | OPCODE(0x3B), + vxor_vi = InstructionType::opivi | OPCODE(0xB), + vxor_vv = InstructionType::opivv | OPCODE(0xB), + vzext_vf2 = InstructionType::opmvv | OPCODE(0x12) | (0x6 << 15), +}; + +enum OperandTypes : uint32_t { + rnIsImm = 1 << 1, + rmIsImm = 1 << 2, + rnIsGpr = 1 << 3, + rmIsGpr = 1 << 4, + rdIsGpr = 1 << 5 +}; +}; // namespace SimdOp + +static void simdEmitVsetivli(struct sljit_compiler* compiler, sljit_s32 type, sljit_ins vlmul) +{ + uint32_t elem_size = (uint32_t)(((type) >> 18) & 0x3f); + uint32_t avl = (uint32_t)1 << (4 - elem_size); + + uint32_t opcode = VSETIVLI | (6 << 7) | (elem_size << 23) | (vlmul << 20) | (avl << 15); + sljit_emit_op_custom(compiler, &opcode, sizeof(uint32_t)); +} + +static void simdEmitOp(sljit_compiler* compiler, uint32_t opcode, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, uint32_t optype = 0) +{ + rd = sljit_get_register_index((optype & SimdOp::rdIsGpr) ? SLJIT_GP_REGISTER : SLJIT_SIMD_REG_128, rd); + if (!(optype & SimdOp::rnIsImm) && !(optype & SimdOp::rnIsGpr)) { + ASSERT(rn >= SLJIT_VR0); + rn = sljit_get_register_index(SLJIT_SIMD_REG_128, rn); + } + if (optype & SimdOp::rnIsGpr) { + ASSERT(rn >= SLJIT_R0); + rn = sljit_get_register_index(SLJIT_GP_REGISTER, rn); + } + if (!(optype & SimdOp::rmIsImm) && !(optype & SimdOp::rmIsGpr)) { + ASSERT(rm >= SLJIT_VR0); + rm = sljit_get_register_index(SLJIT_SIMD_REG_128, rm); + } + if (optype & SimdOp::rmIsGpr) { + ASSERT(rm >= SLJIT_R0); + rm = sljit_get_register_index(SLJIT_GP_REGISTER, rm); + } + + opcode |= ((uint32_t)rd << 7) | ((uint32_t)rm << 15) | ((uint32_t)rn << 20); + sljit_emit_op_custom(compiler, &opcode, sizeof(uint32_t)); +} + +static void simdEmitTypedOp(sljit_compiler* compiler, sljit_s32 type, uint32_t opcode, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, uint32_t optype = 0, sljit_ins vlmul = 0) +{ + simdEmitVsetivli(compiler, type, vlmul); + simdEmitOp(compiler, opcode, rd, rn, rm, optype); +} + +static void simdEmitCSRRWI(sljit_compiler* compiler, sljit_s32 rd, uint32_t csr, uint32_t uimm) +{ + rd = sljit_get_register_index(SLJIT_GP_REGISTER, rd); + uint32_t opcode = 0x73 | ((uint32_t)rd << 7) | (0x5 << 12) | (csr << 20) | (uimm << 15); + sljit_emit_op_custom(compiler, &opcode, sizeof(uint32_t)); +} + +static void simdEmitCSRRW(sljit_compiler* compiler, sljit_s32 rd, uint32_t csr, sljit_s32 rs1) +{ + rd = sljit_get_register_index(SLJIT_GP_REGISTER, rd); + rs1 = sljit_get_register_index(SLJIT_GP_REGISTER, rs1); + uint32_t opcode = 0x73 | ((uint32_t)rd << 7) | ((uint32_t)rs1 << 15) | (0x1 << 12) | (csr << 20); + sljit_emit_op_custom(compiler, &opcode, sizeof(uint32_t)); +} + +static void simdEmitAbs(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn) +{ + 
sljit_s32 mask = SLJIT_VR0; + sljit_s32 tmp = rd == rn ? SLJIT_TMP_DEST_VREG : rd; + + simdEmitTypedOp(compiler, type, SimdOp::vmv_vv, tmp, 0, rn, SimdOp::rnIsImm); + simdEmitOp(compiler, SimdOp::vmslt_vx, mask, rn, TMP_ZERO); + simdEmitOp(compiler, SimdOp::vrsub_vi ^ SimdOp::vm, tmp, rn, 0, SimdOp::rmIsImm); + + if (rd == rn) { + simdEmitOp(compiler, SimdOp::vmv_vv, rd, 0, tmp); + } +} + +static void simdEmitAllTrue(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + simdEmitTypedOp(compiler, type, SimdOp::vmv_vi, tmp, 0, (0x1F), SimdOp::rmIsImm); + simdEmitOp(compiler, SimdOp::vredminu_vs, tmp, rn, tmp); + simdEmitOp(compiler, SimdOp::vmv_xs, rd, 0, tmp, SimdOp::rnIsImm | SimdOp::rdIsGpr); + struct sljit_jump* notAllTrue = sljit_emit_cmp(compiler, SLJIT_EQUAL, rd, 0, SLJIT_IMM, 0); + sljit_emit_op1(compiler, SLJIT_MOV, rd, 0, SLJIT_IMM, 1); + sljit_set_label(notAllTrue, sljit_emit_label(compiler)); +} + +static void simdEmitAnyTrue(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + simdEmitTypedOp(compiler, type, SimdOp::vmv_sx, tmp, 0, TMP_ZERO, SimdOp::rmIsGpr); + simdEmitOp(compiler, SimdOp::vredmaxu_vs, tmp, rn, tmp); + simdEmitOp(compiler, SimdOp::vmv_xs, rd, 0, tmp, SimdOp::rnIsImm | SimdOp::rdIsGpr); + struct sljit_jump* notAnyTrue = sljit_emit_cmp(compiler, SLJIT_EQUAL, rd, 0, SLJIT_IMM, 0); + sljit_emit_op1(compiler, SLJIT_MOV, rd, 0, SLJIT_IMM, 1); + sljit_set_label(notAnyTrue, sljit_emit_label(compiler)); +} + +static void simdEmitAvgr(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm) +{ + sljit_s32 gptmp = SLJIT_TMP_DEST_REG; + simdEmitCSRRWI(compiler, gptmp, 0x00A, 0); + simdEmitTypedOp(compiler, type, SimdOp::vaaddu_vv, rd, rn, rm); + simdEmitCSRRW(compiler, gptmp, 0x00A, gptmp); +} + +static void simdEmitCompare(sljit_compiler* compiler, sljit_s32 type, uint32_t opcode, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, bool reverseMask = false) +{ + sljit_s32 mask = SLJIT_VR0; + simdEmitTypedOp(compiler, type, opcode, mask, rn, rm); + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + simdEmitOp(compiler, SimdOp::vmv_vi, tmp, 0, reverseMask ? (0x1F) : 0, SimdOp::rmIsImm | SimdOp::rnIsImm); + simdEmitOp(compiler, SimdOp::vmerge_vi, rd, tmp, reverseMask ? 0 : (0x1F), SimdOp::rmIsImm); +} + +static void simdEmitExtend(sljit_compiler* compiler, sljit_s32 type, bool low, bool isSigned, sljit_s32 rd, sljit_s32 rn) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + + if (low) { + simdEmitTypedOp(compiler, type, SimdOp::vmv_vv, tmp, 0, rn); + } else { + sljit_s32 t = SLJIT_SIMD_ELEM_8; + uint32_t imm = 8; + switch (type) { + case SLJIT_SIMD_ELEM_32: + t = SLJIT_SIMD_ELEM_16; + imm = 4; + break; + case SLJIT_SIMD_ELEM_64: + t = SLJIT_SIMD_ELEM_32; + imm = 2; + break; + } + simdEmitTypedOp(compiler, t, SimdOp::vslidedown_vi, tmp, rn, imm, SimdOp::rmIsImm); + } + uint32_t opcode = isSigned ? 
SimdOp::vsext_vf2 : SimdOp::vzext_vf2; + opcode |= (sljit_get_register_index(SLJIT_SIMD_REG_128, rd) << 7) | (sljit_get_register_index(SLJIT_SIMD_REG_128, tmp) << 20); + simdEmitVsetivli(compiler, type, 7); + sljit_emit_op_custom(compiler, &opcode, sizeof(uint32_t)); +} + +static void simdEmitI32x4DotI16x8(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + simdEmitTypedOp(compiler, SLJIT_SIMD_ELEM_32, SimdOp::vmul_vv, tmp, rn, rm); + simdEmitTypedOp(compiler, SLJIT_SIMD_ELEM_16, SimdOp::vredsum_vs, rd, tmp, rn); +} + +static void simdEmitFCeil(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn) +{ + const int floatMantissaBits = type == (SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64) ? 52 : 23; + const int floatExponentBits = type == (SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64) ? 11 : 8; + const int floatExponentBias = type == (SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64) ? 1023 : 127; + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + sljit_s32 gptmp = SLJIT_TMP_DEST_REG; + sljit_s32 ftmp = SLJIT_TMP_DEST_FREG; + sljit_s32 mask = SLJIT_VR0; + simdEmitTypedOp(compiler, type, SimdOp::vmv_vi, tmp, 0, 0, SimdOp::rmIsImm | SimdOp::rnIsImm); + sljit_emit_op1(compiler, SLJIT_MOV, gptmp, 0, SLJIT_IMM, 64 - floatMantissaBits - floatExponentBits); + simdEmitOp(compiler, SimdOp::vsll_vx, tmp, rn, gptmp, SimdOp::rmIsGpr); + sljit_emit_op1(compiler, SLJIT_MOV, gptmp, 0, SLJIT_IMM, 64 - floatExponentBias); + simdEmitOp(compiler, SimdOp::vsrl_vx, tmp, tmp, gptmp, SimdOp::rmIsGpr); + sljit_emit_op1(compiler, SLJIT_MOV, gptmp, 0, SLJIT_IMM, floatExponentBias + floatMantissaBits); + simdEmitOp(compiler, SimdOp::vmslt_vx, mask, tmp, gptmp, SimdOp::rmIsGpr); + simdEmitCSRRWI(compiler, gptmp, 0x00A, 0x3); + simdEmitOp(compiler, SimdOp::vmv_vv, rd, 0, rn); + if (rd == rn) { + simdEmitOp(compiler, SimdOp::vmv_vv, tmp, 0, rn); + } + simdEmitOp(compiler, SimdOp::vfcvt_x_f_v ^ SimdOp::vm, rd, rn, 0, SimdOp::rmIsImm); + simdEmitOp(compiler, SimdOp::vfcvt_f_x_v ^ SimdOp::vm, rd, rd, 0, SimdOp::rmIsImm); + if (rd == rn) { + simdEmitOp(compiler, SimdOp::vfsgnj_vv, rd, rd, tmp); + } else { + simdEmitOp(compiler, SimdOp::vfsgnj_vv, rd, rd, rn); + } + simdEmitOp(compiler, SimdOp::vmfeq_vv, mask, rn, rn); + simdEmitOp(compiler, SimdOp::vxor_vi, mask, mask, (0x1F), SimdOp::rmIsImm); + uint32_t opC = (sljit_get_register_index(SLJIT_FLOAT_REGISTER, ftmp) << 7) | (TMP_ZERO << 15); + if (type == (SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32)) { + opC |= 0x53 | (0x70 << 25); + } else { +#if (defined SLJIT_CONFIG_RISCV64 && SLJIT_CONFIG_RISCV64) + opC |= 0x53 | (0x79 << 25); +#else + opC |= 0x53 | (0x69 << 25); +#endif + } + sljit_emit_op_custom(compiler, &opC, sizeof(uint32_t)); + simdEmitOp(compiler, SimdOp::vfadd_vf ^ SimdOp::vm, rd, rn, ftmp); +} + +static void simdEmitFMinMax(sljit_compiler* compiler, sljit_s32 type, sljit_s32 opcode, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm) +{ + sljit_s32 mask = SLJIT_VR0; + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + sljit_s32 gptmp = SLJIT_TMP_DEST_REG; + simdEmitTypedOp(compiler, type, SimdOp::vmfeq_vv, mask, rn, rn); + simdEmitOp(compiler, SimdOp::vmfeq_vv, tmp, rm, rm); + simdEmitOp(compiler, SimdOp::vand_vv, mask, mask, tmp); + sljit_emit_op1(compiler, SLJIT_MOV, gptmp, 0, SLJIT_IMM, type == (SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64) ? 
0x7FF8000000000000 : 0x7FC00000U); + simdEmitOp(compiler, SimdOp::vmv_vx, tmp, 0, gptmp, SimdOp::rmIsGpr | SimdOp::rnIsImm); + simdEmitOp(compiler, opcode ^ SimdOp::vm, tmp, rn, rm); + simdEmitOp(compiler, SimdOp::vmv_vv, rd, 0, tmp, SimdOp::rnIsImm); +} + +static void SimdEmitFTrunc(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + simdEmitTypedOp(compiler, type, SimdOp::vfcvt_x_f_v, tmp, rn, 0, SimdOp::rmIsImm); + simdEmitTypedOp(compiler, type, SimdOp::vfcvt_f_x_v, rd, tmp, 0, SimdOp::rmIsImm); +} + +static void simdEmitPMinMax(sljit_compiler* compiler, sljit_s32 type, bool min, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm) +{ + sljit_s32 mask = SLJIT_VR0; + simdEmitTypedOp(compiler, type, SimdOp::vmflt_vv, mask, min ? rm : rn, min ? rn : rm); + simdEmitOp(compiler, SimdOp::vmerge_vv, rd, rn, rm); +} + +static void simdEmitPopcnt(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn, sljit_s32 rt) +{ +#if defined(__riscv_zvbb) + simdEmitTypedOp(compiler, type, SimdOp::vcpop_v, rd, rn, 0, SimdOp::rmIsImm); +#else + sljit_s32 mask = SLJIT_VR0; + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + sljit_s32 tmpgp = SLJIT_TMP_DEST_REG; + simdEmitTypedOp(compiler, type, SimdOp::vmv_vv, tmp, 0, rn); + simdEmitOp(compiler, SimdOp::vmv_vi, rd, 0, 0, SimdOp::rmIsImm); + struct sljit_label* label = sljit_emit_label(compiler); + simdEmitOp(compiler, SimdOp::vmsne_vi, mask, tmp, 0); + simdEmitOp(compiler, SimdOp::vadd_vi ^ SimdOp::vm, rd, rd, 1, SimdOp::rmIsImm); + simdEmitOp(compiler, SimdOp::vadd_vi ^ SimdOp::vm, rt, tmp, (0x1F), SimdOp::rmIsImm); + simdEmitOp(compiler, SimdOp::vand_vv, tmp, tmp, rt); + uint32_t opcode = SimdOp::vfirst_m | sljit_get_register_index(SLJIT_GP_REGISTER, tmpgp) << 7 | sljit_get_register_index(SLJIT_SIMD_REG_128, tmp) << 20; + sljit_emit_op_custom(compiler, &opcode, sizeof(uint32_t)); + struct sljit_jump* jump = sljit_emit_cmp(compiler, SLJIT_SIG_LESS, tmpgp, 0, SLJIT_IMM, 0); + sljit_set_label(sljit_emit_jump(compiler, SLJIT_JUMP), label); + sljit_set_label(jump, sljit_emit_label(compiler)); +#endif +} + +static void simdEmitSwizzle(sljit_compiler* compiler, sljit_s32 type, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_VREG; + if (rd == rn) { + simdEmitTypedOp(compiler, type, SimdOp::vrgather_vv, tmp, rn, rm); + simdEmitOp(compiler, SimdOp::vmv_vv, rd, 0, tmp); + } else { + simdEmitTypedOp(compiler, type, SimdOp::vrgather_vv, rd, rn, rm); + } +} + +static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) +{ + Operand* operands = instr->operands(); + JITArg args[2]; + + sljit_s32 srcType = SLJIT_SIMD_ELEM_128; + sljit_s32 dstType = SLJIT_SIMD_ELEM_128; + + switch (instr->opcode()) { + case ByteCode::I8X16NegOpcode: + case ByteCode::I8X16AbsOpcode: + case ByteCode::I8X16PopcntOpcode: + srcType = SLJIT_SIMD_ELEM_8; + dstType = SLJIT_SIMD_ELEM_8; + break; + case ByteCode::I16X8ExtendLowI8X16SOpcode: + case ByteCode::I16X8ExtendHighI8X16SOpcode: + case ByteCode::I16X8ExtendLowI8X16UOpcode: + case ByteCode::I16X8ExtendHighI8X16UOpcode: + srcType = SLJIT_SIMD_ELEM_8; + dstType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I16X8NegOpcode: + case ByteCode::I16X8AbsOpcode: + case ByteCode::I16X8ExtaddPairwiseI8X16SOpcode: + case ByteCode::I16X8ExtaddPairwiseI8X16UOpcode: + srcType = SLJIT_SIMD_ELEM_16; + dstType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I32X4NegOpcode: + case ByteCode::I32X4AbsOpcode: + case ByteCode::I32X4ExtaddPairwiseI16X8SOpcode: + case 
ByteCode::I32X4ExtaddPairwiseI16X8UOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I32X4ExtendLowI16X8SOpcode: + case ByteCode::I32X4ExtendHighI16X8SOpcode: + case ByteCode::I32X4ExtendLowI16X8UOpcode: + case ByteCode::I32X4ExtendHighI16X8UOpcode: + srcType = SLJIT_SIMD_ELEM_16; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I32X4TruncSatF32X4SOpcode: + case ByteCode::I32X4TruncSatF32X4UOpcode: + srcType = SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_FLOAT; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I32X4TruncSatF64X2SZeroOpcode: + case ByteCode::I32X4TruncSatF64X2UZeroOpcode: + srcType = SLJIT_SIMD_ELEM_64 | SLJIT_SIMD_FLOAT; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I64X2NegOpcode: + case ByteCode::I64X2AbsOpcode: + srcType = SLJIT_SIMD_ELEM_64; + dstType = SLJIT_SIMD_ELEM_64; + break; + case ByteCode::I64X2ExtendLowI32X4SOpcode: + case ByteCode::I64X2ExtendHighI32X4SOpcode: + case ByteCode::I64X2ExtendLowI32X4UOpcode: + case ByteCode::I64X2ExtendHighI32X4UOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_ELEM_64; + break; + case ByteCode::F32X4AbsOpcode: + case ByteCode::F32X4NegOpcode: + case ByteCode::F32X4SqrtOpcode: + case ByteCode::F32X4CeilOpcode: + case ByteCode::F32X4FloorOpcode: + case ByteCode::F32X4TruncOpcode: + case ByteCode::F32X4NearestOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + break; + case ByteCode::F32X4ConvertI32X4SOpcode: + case ByteCode::F32X4ConvertI32X4UOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + break; + case ByteCode::F32X4DemoteF64X2ZeroOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + break; + case ByteCode::F64X2AbsOpcode: + case ByteCode::F64X2NegOpcode: + case ByteCode::F64X2SqrtOpcode: + case ByteCode::F64X2CeilOpcode: + case ByteCode::F64X2FloorOpcode: + case ByteCode::F64X2TruncOpcode: + case ByteCode::F64X2NearestOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + break; + case ByteCode::F64X2ConvertLowI32X4SOpcode: + case ByteCode::F64X2ConvertLowI32X4UOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + break; + case ByteCode::F64X2PromoteLowF32X4Opcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + break; + case ByteCode::V128NotOpcode: + srcType = SLJIT_SIMD_ELEM_128; + dstType = SLJIT_SIMD_ELEM_128; + break; + default: + ASSERT_NOT_REACHED(); + break; + } + + simdOperandToArg(compiler, operands, args[0], srcType, instr->requiredReg(0)); + + args[1].set(operands + 1); + sljit_s32 dst = GET_TARGET_REG(args[1].arg, instr->requiredReg(0)); + + switch (instr->opcode()) { + case ByteCode::F32X4DemoteF64X2ZeroOpcode: + break; + case ByteCode::F32X4ConvertI32X4SOpcode: + break; + case ByteCode::F32X4ConvertI32X4UOpcode: + break; + case ByteCode::F64X2ConvertLowI32X4SOpcode: + break; + case ByteCode::F64X2ConvertLowI32X4UOpcode: + break; + case ByteCode::V128NotOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vxor_vv, dst, args[0].arg, (0x1F), SimdOp::rmIsImm); + break; + case ByteCode::I8X16NegOpcode: + case ByteCode::I16X8NegOpcode: + case ByteCode::I32X4NegOpcode: + case ByteCode::I64X2NegOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vrsub_vi, dst, args[0].arg, 0); + break; + case ByteCode::I8X16AbsOpcode: + case 
ByteCode::I16X8AbsOpcode: + case ByteCode::I32X4AbsOpcode: + case ByteCode::I64X2AbsOpcode: + simdEmitAbs(compiler, srcType, dst, args[0].arg); + break; + case ByteCode::I8X16PopcntOpcode: + simdEmitPopcnt(compiler, srcType, dst, args[0].arg, instr->requiredReg(1)); + break; + case ByteCode::I16X8ExtaddPairwiseI8X16SOpcode: + break; + case ByteCode::I16X8ExtaddPairwiseI8X16UOpcode: + break; + case ByteCode::I16X8ExtendLowI8X16SOpcode: + case ByteCode::I32X4ExtendLowI16X8SOpcode: + case ByteCode::I64X2ExtendLowI32X4SOpcode: + simdEmitExtend(compiler, srcType, true, true, dst, args[0].arg); + break; + case ByteCode::I16X8ExtendHighI8X16SOpcode: + case ByteCode::I32X4ExtendHighI16X8SOpcode: + case ByteCode::I64X2ExtendHighI32X4SOpcode: + simdEmitExtend(compiler, srcType, false, true, dst, args[0].arg); + break; + case ByteCode::I16X8ExtendLowI8X16UOpcode: + case ByteCode::I32X4ExtendLowI16X8UOpcode: + case ByteCode::I64X2ExtendLowI32X4UOpcode: + simdEmitExtend(compiler, srcType, true, false, dst, args[0].arg); + break; + case ByteCode::I16X8ExtendHighI8X16UOpcode: + case ByteCode::I32X4ExtendHighI16X8UOpcode: + case ByteCode::I64X2ExtendHighI32X4UOpcode: + simdEmitExtend(compiler, srcType, false, false, dst, args[0].arg); + break; + case ByteCode::I32X4ExtaddPairwiseI16X8SOpcode: + break; + case ByteCode::I32X4ExtaddPairwiseI16X8UOpcode: + break; + case ByteCode::I32X4TruncSatF32X4SOpcode: + break; + case ByteCode::I32X4TruncSatF32X4UOpcode: + break; + case ByteCode::I32X4TruncSatF64X2SZeroOpcode: + break; + case ByteCode::I32X4TruncSatF64X2UZeroOpcode: + break; + case ByteCode::F32X4AbsOpcode: + case ByteCode::F64X2AbsOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vfsgnjx_vv, dst, args[0].arg, args[0].arg); + break; + case ByteCode::F32X4CeilOpcode: + case ByteCode::F64X2CeilOpcode: + simdEmitFCeil(compiler, srcType, dst, args[0].arg); + break; + case ByteCode::F32X4FloorOpcode: + case ByteCode::F64X2FloorOpcode: + break; + case ByteCode::F32X4NearestOpcode: + case ByteCode::F64X2NearestOpcode: + break; + case ByteCode::F32X4NegOpcode: + case ByteCode::F64X2NegOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vfsgnjn_vv, dst, args[0].arg, args[0].arg); + break; + case ByteCode::F32X4SqrtOpcode: + case ByteCode::F64X2SqrtOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vfsqrt_v, dst, args[0].arg, 0); + break; + case ByteCode::F32X4TruncOpcode: + case ByteCode::F64X2TruncOpcode: + SimdEmitFTrunc(compiler, srcType, dst, args[0].arg); + break; + case ByteCode::F64X2PromoteLowF32X4Opcode: + break; + default: + ASSERT_NOT_REACHED(); + break; + } + + if (SLJIT_IS_MEM(args[1].arg)) { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | dstType, dst, args[1].arg, args[1].argw); + } +} + +static bool emitUnaryCondSIMD(sljit_compiler* compiler, Instruction* instr) +{ + Operand* operands = instr->operands(); + JITArg args[2]; + + sljit_s32 srcType = SLJIT_SIMD_ELEM_128; + sljit_s32 type = SLJIT_NOT_EQUAL; + + switch (instr->opcode()) { + case ByteCode::I8X16AllTrueOpcode: + srcType = SLJIT_SIMD_ELEM_8; + break; + case ByteCode::I16X8AllTrueOpcode: + srcType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I32X4AllTrueOpcode: + srcType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I64X2AllTrueOpcode: + srcType = SLJIT_SIMD_ELEM_64; + type = SLJIT_ORDERED_EQUAL; + break; + default: + ASSERT(instr->opcode() == ByteCode::V128AnyTrueOpcode); + srcType = SLJIT_SIMD_ELEM_8; + break; + } + + simdOperandToArg(compiler, operands, args[0], srcType, instr->requiredReg(0)); + + 
sljit_s32 dst = SLJIT_TMP_DEST_REG; + + if (!(instr->info() & Instruction::kIsMergeCompare)) { + args[1].set(operands + 1); + dst = GET_TARGET_REG(args[1].arg, SLJIT_TMP_DEST_REG); + } + + switch (instr->opcode()) { + case ByteCode::I8X16AllTrueOpcode: + case ByteCode::I16X8AllTrueOpcode: + case ByteCode::I32X4AllTrueOpcode: + case ByteCode::I64X2AllTrueOpcode: + simdEmitAllTrue(compiler, srcType, dst, args[0].arg); + break; + default: + ASSERT(instr->opcode() == ByteCode::V128AnyTrueOpcode); + simdEmitAnyTrue(compiler, srcType, dst, args[0].arg); + break; + } + + ASSERT(instr->next() != nullptr); + + if (instr->info() & Instruction::kIsMergeCompare) { + return true; + } + + return false; +} + +static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) +{ + Operand* operands = instr->operands(); + JITArg args[3]; + + sljit_s32 srcType = SLJIT_SIMD_ELEM_128; + sljit_s32 dstType = SLJIT_SIMD_ELEM_128; + + switch (instr->opcode()) { + case ByteCode::I8X16AddOpcode: + case ByteCode::I8X16SubOpcode: + case ByteCode::I8X16AddSatSOpcode: + case ByteCode::I8X16AddSatUOpcode: + case ByteCode::I8X16SubSatSOpcode: + case ByteCode::I8X16SubSatUOpcode: + case ByteCode::I8X16EqOpcode: + case ByteCode::I8X16NeOpcode: + case ByteCode::I8X16LtSOpcode: + case ByteCode::I8X16LtUOpcode: + case ByteCode::I8X16LeSOpcode: + case ByteCode::I8X16LeUOpcode: + case ByteCode::I8X16GtSOpcode: + case ByteCode::I8X16GtUOpcode: + case ByteCode::I8X16GeSOpcode: + case ByteCode::I8X16GeUOpcode: + case ByteCode::I8X16MinSOpcode: + case ByteCode::I8X16MinUOpcode: + case ByteCode::I8X16MaxSOpcode: + case ByteCode::I8X16MaxUOpcode: + case ByteCode::I8X16AvgrUOpcode: + case ByteCode::I8X16SwizzleOpcode: + srcType = SLJIT_SIMD_ELEM_8; + dstType = SLJIT_SIMD_ELEM_8; + break; + case ByteCode::I8X16NarrowI16X8SOpcode: + case ByteCode::I8X16NarrowI16X8UOpcode: + srcType = SLJIT_SIMD_ELEM_16; + dstType = SLJIT_SIMD_ELEM_8; + break; + case ByteCode::I16X8AddOpcode: + case ByteCode::I16X8SubOpcode: + case ByteCode::I16X8MulOpcode: + case ByteCode::I16X8AddSatSOpcode: + case ByteCode::I16X8AddSatUOpcode: + case ByteCode::I16X8SubSatSOpcode: + case ByteCode::I16X8SubSatUOpcode: + case ByteCode::I16X8EqOpcode: + case ByteCode::I16X8NeOpcode: + case ByteCode::I16X8LtSOpcode: + case ByteCode::I16X8LtUOpcode: + case ByteCode::I16X8LeSOpcode: + case ByteCode::I16X8LeUOpcode: + case ByteCode::I16X8GtSOpcode: + case ByteCode::I16X8GtUOpcode: + case ByteCode::I16X8GeSOpcode: + case ByteCode::I16X8GeUOpcode: + case ByteCode::I16X8MinSOpcode: + case ByteCode::I16X8MinUOpcode: + case ByteCode::I16X8MaxSOpcode: + case ByteCode::I16X8MaxUOpcode: + case ByteCode::I16X8AvgrUOpcode: + case ByteCode::I16X8Q15mulrSatSOpcode: + srcType = SLJIT_SIMD_ELEM_16; + dstType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I16X8ExtmulLowI8X16SOpcode: + case ByteCode::I16X8ExtmulHighI8X16SOpcode: + case ByteCode::I16X8ExtmulLowI8X16UOpcode: + case ByteCode::I16X8ExtmulHighI8X16UOpcode: + srcType = SLJIT_SIMD_ELEM_8; + dstType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I16X8NarrowI32X4SOpcode: + case ByteCode::I16X8NarrowI32X4UOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I32X4AddOpcode: + case ByteCode::I32X4SubOpcode: + case ByteCode::I32X4MulOpcode: + case ByteCode::I32X4EqOpcode: + case ByteCode::I32X4NeOpcode: + case ByteCode::I32X4LtSOpcode: + case ByteCode::I32X4LtUOpcode: + case ByteCode::I32X4LeSOpcode: + case ByteCode::I32X4LeUOpcode: + case ByteCode::I32X4GtSOpcode: + case 
ByteCode::I32X4GtUOpcode: + case ByteCode::I32X4GeSOpcode: + case ByteCode::I32X4GeUOpcode: + case ByteCode::I32X4MinSOpcode: + case ByteCode::I32X4MinUOpcode: + case ByteCode::I32X4MaxSOpcode: + case ByteCode::I32X4MaxUOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I32X4ExtmulLowI16X8SOpcode: + case ByteCode::I32X4ExtmulHighI16X8SOpcode: + case ByteCode::I32X4ExtmulLowI16X8UOpcode: + case ByteCode::I32X4ExtmulHighI16X8UOpcode: + case ByteCode::I32X4DotI16X8SOpcode: + srcType = SLJIT_SIMD_ELEM_16; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I64X2AddOpcode: + case ByteCode::I64X2SubOpcode: + case ByteCode::I64X2MulOpcode: + case ByteCode::I64X2EqOpcode: + case ByteCode::I64X2NeOpcode: + case ByteCode::I64X2LtSOpcode: + case ByteCode::I64X2LeSOpcode: + case ByteCode::I64X2GtSOpcode: + case ByteCode::I64X2GeSOpcode: + srcType = SLJIT_SIMD_ELEM_64; + dstType = SLJIT_SIMD_ELEM_64; + break; + case ByteCode::I64X2ExtmulLowI32X4SOpcode: + case ByteCode::I64X2ExtmulHighI32X4SOpcode: + case ByteCode::I64X2ExtmulLowI32X4UOpcode: + case ByteCode::I64X2ExtmulHighI32X4UOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_ELEM_64; + break; + case ByteCode::F32X4AddOpcode: + case ByteCode::F32X4SubOpcode: + case ByteCode::F32X4MulOpcode: + case ByteCode::F32X4DivOpcode: + case ByteCode::F32X4EqOpcode: + case ByteCode::F32X4NeOpcode: + case ByteCode::F32X4LtOpcode: + case ByteCode::F32X4LeOpcode: + case ByteCode::F32X4GtOpcode: + case ByteCode::F32X4GeOpcode: + case ByteCode::F32X4PMinOpcode: + case ByteCode::F32X4PMaxOpcode: + case ByteCode::F32X4MaxOpcode: + case ByteCode::F32X4MinOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + break; + case ByteCode::F64X2AddOpcode: + case ByteCode::F64X2SubOpcode: + case ByteCode::F64X2MulOpcode: + case ByteCode::F64X2DivOpcode: + case ByteCode::F64X2SqrtOpcode: + case ByteCode::F64X2EqOpcode: + case ByteCode::F64X2NeOpcode: + case ByteCode::F64X2LtOpcode: + case ByteCode::F64X2LeOpcode: + case ByteCode::F64X2GtOpcode: + case ByteCode::F64X2GeOpcode: + case ByteCode::F64X2PMinOpcode: + case ByteCode::F64X2PMaxOpcode: + case ByteCode::F64X2MaxOpcode: + case ByteCode::F64X2MinOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + break; + case ByteCode::V128AndOpcode: + case ByteCode::V128OrOpcode: + case ByteCode::V128XorOpcode: + case ByteCode::V128AndnotOpcode: + srcType = SLJIT_SIMD_ELEM_128; + dstType = SLJIT_SIMD_ELEM_128; + break; + default: + ASSERT_NOT_REACHED(); + break; + } + + simdOperandToArg(compiler, operands, args[0], srcType, instr->requiredReg(0)); + simdOperandToArg(compiler, operands + 1, args[1], srcType, instr->requiredReg(1)); + + args[2].set(operands + 2); + sljit_s32 dst = GET_TARGET_REG(args[2].arg, instr->requiredReg(2)); + + switch (instr->opcode()) { + case ByteCode::I8X16AddOpcode: + case ByteCode::I16X8AddOpcode: + case ByteCode::I32X4AddOpcode: + case ByteCode::I64X2AddOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vadd_vv, dst, args[0].arg, args[1].arg); + break; + case ByteCode::I8X16SubOpcode: + case ByteCode::I16X8SubOpcode: + case ByteCode::I32X4SubOpcode: + case ByteCode::I64X2SubOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vsub_vv, dst, args[0].arg, args[1].arg); + break; + case ByteCode::I8X16AddSatSOpcode: + case ByteCode::I16X8AddSatSOpcode: + simdEmitTypedOp(compiler, srcType, SimdOp::vsadd_vv, dst, args[0].arg, 
+    switch (instr->opcode()) {
+    case ByteCode::I8X16AddOpcode:
+    case ByteCode::I16X8AddOpcode:
+    case ByteCode::I32X4AddOpcode:
+    case ByteCode::I64X2AddOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vadd_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16SubOpcode:
+    case ByteCode::I16X8SubOpcode:
+    case ByteCode::I32X4SubOpcode:
+    case ByteCode::I64X2SubOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vsub_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16AddSatSOpcode:
+    case ByteCode::I16X8AddSatSOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vsadd_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16AddSatUOpcode:
+    case ByteCode::I16X8AddSatUOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vsaddu_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16SubSatSOpcode:
+    case ByteCode::I16X8SubSatSOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vssub_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16SubSatUOpcode:
+    case ByteCode::I16X8SubSatUOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vssubu_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16EqOpcode:
+    case ByteCode::I16X8EqOpcode:
+    case ByteCode::I32X4EqOpcode:
+    case ByteCode::I64X2EqOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmseq_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16NeOpcode:
+    case ByteCode::I16X8NeOpcode:
+    case ByteCode::I32X4NeOpcode:
+    case ByteCode::I64X2NeOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsne_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16LtSOpcode:
+    case ByteCode::I16X8LtSOpcode:
+    case ByteCode::I32X4LtSOpcode:
+    case ByteCode::I64X2LtSOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmslt_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16LtUOpcode:
+    case ByteCode::I16X8LtUOpcode:
+    case ByteCode::I32X4LtUOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsltu_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16LeSOpcode:
+    case ByteCode::I16X8LeSOpcode:
+    case ByteCode::I32X4LeSOpcode:
+    case ByteCode::I64X2LeSOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsle_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16LeUOpcode:
+    case ByteCode::I16X8LeUOpcode:
+    case ByteCode::I32X4LeUOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsleu_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16GtSOpcode:
+    case ByteCode::I16X8GtSOpcode:
+    case ByteCode::I32X4GtSOpcode:
+    case ByteCode::I64X2GtSOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsle_vv, dst, args[0].arg, args[1].arg, true);
+        break;
+    case ByteCode::I8X16GtUOpcode:
+    case ByteCode::I16X8GtUOpcode:
+    case ByteCode::I32X4GtUOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsleu_vv, dst, args[0].arg, args[1].arg, true);
+        break;
+    case ByteCode::I8X16GeSOpcode:
+    case ByteCode::I16X8GeSOpcode:
+    case ByteCode::I32X4GeSOpcode:
+    case ByteCode::I64X2GeSOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmslt_vv, dst, args[0].arg, args[1].arg, true);
+        break;
+    case ByteCode::I8X16GeUOpcode:
+    case ByteCode::I16X8GeUOpcode:
+    case ByteCode::I32X4GeUOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmsltu_vv, dst, args[0].arg, args[1].arg, true);
+        break;
+    case ByteCode::I8X16MinSOpcode:
+    case ByteCode::I16X8MinSOpcode:
+    case ByteCode::I32X4MinSOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vmin_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16MinUOpcode:
+    case ByteCode::I16X8MinUOpcode:
+    case ByteCode::I32X4MinUOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vminu_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16MaxSOpcode:
+    case ByteCode::I16X8MaxSOpcode:
+    case ByteCode::I32X4MaxSOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vmax_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16MaxUOpcode:
+    case ByteCode::I16X8MaxUOpcode:
+    case ByteCode::I32X4MaxUOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vmaxu_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I8X16AvgrUOpcode:
+    case ByteCode::I16X8AvgrUOpcode:
+        simdEmitAvgr(compiler, srcType, dst, args[0].arg, args[1].arg);
+        break;
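+    // The cases below that only break are presumably unimplemented
+    // placeholders for this target; they leave dst unmodified.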
+    case ByteCode::I8X16NarrowI16X8SOpcode:
+    case ByteCode::I8X16NarrowI16X8UOpcode:
+        break;
+    case ByteCode::I16X8MulOpcode:
+    case ByteCode::I32X4MulOpcode:
+    case ByteCode::I64X2MulOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vmul_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::I16X8ExtmulLowI8X16SOpcode:
+    case ByteCode::I16X8ExtmulHighI8X16SOpcode:
+    case ByteCode::I16X8ExtmulLowI8X16UOpcode:
+    case ByteCode::I16X8ExtmulHighI8X16UOpcode:
+    case ByteCode::I16X8NarrowI32X4SOpcode:
+    case ByteCode::I16X8NarrowI32X4UOpcode:
+    case ByteCode::I16X8Q15mulrSatSOpcode:
+    case ByteCode::I32X4ExtmulLowI16X8SOpcode:
+    case ByteCode::I32X4ExtmulHighI16X8SOpcode:
+    case ByteCode::I32X4ExtmulLowI16X8UOpcode:
+    case ByteCode::I32X4ExtmulHighI16X8UOpcode:
+        break;
+    case ByteCode::I32X4DotI16X8SOpcode:
+        simdEmitI32x4DotI16x8(compiler, srcType, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4AddOpcode:
+    case ByteCode::F64X2AddOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vfadd_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4SubOpcode:
+    case ByteCode::F64X2SubOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vfsub_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4MulOpcode:
+    case ByteCode::F64X2MulOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vfmul_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4DivOpcode:
+    case ByteCode::F64X2DivOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vfdiv_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4MaxOpcode:
+    case ByteCode::F64X2MaxOpcode:
+        simdEmitFMinMax(compiler, srcType, SimdOp::vfmax_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4MinOpcode:
+    case ByteCode::F64X2MinOpcode:
+        simdEmitFMinMax(compiler, srcType, SimdOp::vfmin_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4PMinOpcode:
+    case ByteCode::F64X2PMinOpcode:
+        simdEmitPMinMax(compiler, srcType, true, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4PMaxOpcode:
+    case ByteCode::F64X2PMaxOpcode:
+        simdEmitPMinMax(compiler, srcType, false, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4EqOpcode:
+    case ByteCode::F64X2EqOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmfeq_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4NeOpcode:
+    case ByteCode::F64X2NeOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmfne_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4LtOpcode:
+    case ByteCode::F64X2LtOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmflt_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4LeOpcode:
+    case ByteCode::F64X2LeOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmfle_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::F32X4GtOpcode:
+    case ByteCode::F64X2GtOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmflt_vv, dst, args[1].arg, args[0].arg);
+        break;
+    case ByteCode::F32X4GeOpcode:
+    case ByteCode::F64X2GeOpcode:
+        simdEmitCompare(compiler, srcType, SimdOp::vmfle_vv, dst, args[1].arg, args[0].arg);
+        break;
+    case ByteCode::I64X2ExtmulLowI32X4SOpcode:
+    case ByteCode::I64X2ExtmulHighI32X4SOpcode:
+    case ByteCode::I64X2ExtmulLowI32X4UOpcode:
+    case ByteCode::I64X2ExtmulHighI32X4UOpcode:
+        break;
+    case ByteCode::V128AndOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vand_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::V128OrOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vor_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::V128XorOpcode:
+        simdEmitTypedOp(compiler, srcType, SimdOp::vxor_vv, dst, args[0].arg, args[1].arg);
+        break;
+    case ByteCode::V128AndnotOpcode:
+        break;
+    case ByteCode::I8X16SwizzleOpcode:
+        simdEmitSwizzle(compiler, srcType, dst, args[0].arg, args[1].arg);
+        break;
+    default:
+        ASSERT_NOT_REACHED();
+        break;
+    }
+
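+    // Flush the result to memory when the destination operand does not live
+    // in a register.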
+    if (SLJIT_IS_MEM(args[2].arg)) {
+        sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | dstType, dst, args[2].arg, args[2].argw);
+    }
+}
+
+static void emitTernarySIMD(sljit_compiler* compiler, Instruction* instr)
+{
+    Operand* operands = instr->operands();
+    JITArg args[4];
+
+    sljit_s32 srcType = SLJIT_SIMD_ELEM_128;
+    sljit_s32 dstType = SLJIT_SIMD_ELEM_128;
+
+    switch (instr->opcode()) {
+    case ByteCode::V128BitSelectOpcode:
+        srcType = SLJIT_SIMD_ELEM_128;
+        dstType = SLJIT_SIMD_ELEM_128;
+        break;
+    case ByteCode::I8X16RelaxedLaneSelectOpcode:
+        srcType = SLJIT_SIMD_ELEM_8;
+        dstType = SLJIT_SIMD_ELEM_8;
+        break;
+    case ByteCode::I16X8RelaxedLaneSelectOpcode:
+        srcType = SLJIT_SIMD_ELEM_16;
+        dstType = SLJIT_SIMD_ELEM_16;
+        break;
+    case ByteCode::I32X4RelaxedLaneSelectOpcode:
+        srcType = SLJIT_SIMD_ELEM_32;
+        dstType = SLJIT_SIMD_ELEM_32;
+        break;
+    case ByteCode::I64X2RelaxedLaneSelectOpcode:
+        srcType = SLJIT_SIMD_ELEM_64;
+        dstType = SLJIT_SIMD_ELEM_64;
+        break;
+    case ByteCode::I32X4DotI8X16I7X16AddSOpcode:
+        srcType = SLJIT_SIMD_ELEM_8;
+        dstType = SLJIT_SIMD_ELEM_32;
+        break;
+    case ByteCode::F32X4RelaxedMaddOpcode:
+    case ByteCode::F32X4RelaxedNmaddOpcode:
+        srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32;
+        dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32;
+        break;
+    case ByteCode::F64X2RelaxedMaddOpcode:
+    case ByteCode::F64X2RelaxedNmaddOpcode:
+        srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64;
+        dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64;
+        break;
+    default:
+        ASSERT_NOT_REACHED();
+        break;
+    }
+
+    simdOperandToArg(compiler, operands, args[0], srcType, instr->requiredReg(0));
+    simdOperandToArg(compiler, operands + 1, args[1], srcType, instr->requiredReg(1));
+    simdOperandToArg(compiler, operands + 2, args[2], dstType, instr->requiredReg(2));
+
+    args[3].set(operands + 3);
+    sljit_s32 dst = GET_TARGET_REG(args[3].arg, instr->requiredReg(2));
+
+    // None of the ternary opcodes are implemented yet; the switch below only
+    // reserves the structure for them.
+    switch (instr->opcode()) {
+    case ByteCode::V128BitSelectOpcode:
+    case ByteCode::I8X16RelaxedLaneSelectOpcode:
+    case ByteCode::I16X8RelaxedLaneSelectOpcode:
+    case ByteCode::I32X4RelaxedLaneSelectOpcode:
+    case ByteCode::I64X2RelaxedLaneSelectOpcode:
+    case ByteCode::I32X4DotI8X16I7X16AddSOpcode:
+    case ByteCode::F32X4RelaxedMaddOpcode:
+    case ByteCode::F32X4RelaxedNmaddOpcode:
+    case ByteCode::F64X2RelaxedMaddOpcode:
+    case ByteCode::F64X2RelaxedNmaddOpcode:
+        break;
+    default:
+        ASSERT_NOT_REACHED();
+        break;
+    }
+
+    if (SLJIT_IS_MEM(args[3].arg)) {
+        sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | dstType, dst, args[3].arg, args[3].argw);
+    }
+}
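+
+// The select and shuffle emitters below look like unfinished stubs: they
+// prepare their operands but emit no instructions yet.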
+static void emitSelectSIMD(sljit_compiler* compiler, Instruction* instr)
+{
+    Operand* operands = instr->operands();
+    JITArg args[3];
+
+    simdOperandToArg(compiler, operands, args[0], SLJIT_SIMD_ELEM_128, instr->requiredReg(0));
+    simdOperandToArg(compiler, operands + 1, args[1], SLJIT_SIMD_ELEM_128, instr->requiredReg(1));
+    simdOperandToArg(compiler, operands + 2, args[2], SLJIT_SIMD_ELEM_128, instr->requiredReg(2));
+
+    args[2].set(operands + 3);
+    if (SLJIT_IS_MEM(args[2].arg)) {
+    }
+}
+
+static void emitShuffleSIMD(sljit_compiler* compiler, Instruction* instr)
+{
+}
+
+static void emitShiftSIMD(sljit_compiler* compiler, Instruction* instr)
+{
+    Operand* operands = instr->operands();
+    JITArg args[3];
+
+    uint32_t op = 0;
+    int div = 8;
+    int mask = 0x1F;
+
+    args[1].set(operands + 1);
+
+    const bool isImm = SLJIT_IS_IMM(args[1].arg);
+    sljit_s32 type = SLJIT_SIMD_ELEM_8;
+
+    switch (instr->opcode()) {
+    case ByteCode::I8X16ShlOpcode:
+        op = isImm ? SimdOp::vsll_vi : SimdOp::vsll_vx;
+        break;
+    case ByteCode::I8X16ShrSOpcode:
+        op = isImm ? SimdOp::vsra_vi : SimdOp::vsra_vx;
+        break;
+    case ByteCode::I8X16ShrUOpcode:
+        op = isImm ? SimdOp::vsrl_vi : SimdOp::vsrl_vx;
+        break;
+    case ByteCode::I16X8ShlOpcode:
+        op = isImm ? SimdOp::vsll_vi : SimdOp::vsll_vx;
+        div = 16;
+        type = SLJIT_SIMD_ELEM_16;
+        break;
+    case ByteCode::I16X8ShrSOpcode:
+        op = isImm ? SimdOp::vsra_vi : SimdOp::vsra_vx;
+        div = 16;
+        type = SLJIT_SIMD_ELEM_16;
+        break;
+    case ByteCode::I16X8ShrUOpcode:
+        op = isImm ? SimdOp::vsrl_vi : SimdOp::vsrl_vx;
+        div = 16;
+        type = SLJIT_SIMD_ELEM_16;
+        break;
+    case ByteCode::I32X4ShlOpcode:
+        op = isImm ? SimdOp::vsll_vi : SimdOp::vsll_vx;
+        div = 32;
+        type = SLJIT_SIMD_ELEM_32;
+        break;
+    case ByteCode::I32X4ShrSOpcode:
+        op = isImm ? SimdOp::vsra_vi : SimdOp::vsra_vx;
+        div = 32;
+        type = SLJIT_SIMD_ELEM_32;
+        break;
+    case ByteCode::I32X4ShrUOpcode:
+        op = isImm ? SimdOp::vsrl_vi : SimdOp::vsrl_vx;
+        div = 32;
+        type = SLJIT_SIMD_ELEM_32;
+        break;
+    case ByteCode::I64X2ShlOpcode:
+        op = isImm ? SimdOp::vsll_vi : SimdOp::vsll_vx;
+        div = 64;
+        type = SLJIT_SIMD_ELEM_64;
+        mask = 0x3F;
+        break;
+    case ByteCode::I64X2ShrSOpcode:
+        op = isImm ? SimdOp::vsra_vi : SimdOp::vsra_vx;
+        div = 64;
+        type = SLJIT_SIMD_ELEM_64;
+        mask = 0x3F;
+        break;
+    case ByteCode::I64X2ShrUOpcode:
+        op = isImm ? SimdOp::vsrl_vi : SimdOp::vsrl_vx;
+        div = 64;
+        type = SLJIT_SIMD_ELEM_64;
+        mask = 0x3F;
+        break;
+    default:
+        ASSERT_NOT_REACHED();
+    }
+
+    simdOperandToArg(compiler, operands, args[0], type, instr->requiredReg(0));
+
+    args[2].set(operands + 2);
+    sljit_s32 dst = GET_TARGET_REG(args[2].arg, instr->requiredReg(0));
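+
+    // Wasm takes SIMD shift amounts modulo the lane width; immediate amounts
+    // are reduced here, and a shift by zero degenerates into a register move.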
+    if (isImm) {
+        args[1].argw &= mask;
+
+        if (args[1].argw == 0) {
+            if (args[2].arg != args[0].arg) {
+                sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | type, args[0].arg, args[2].arg, args[2].argw);
+            }
+            return;
+        }
+        args[1].argw %= div;
+    }
+
+    simdEmitTypedOp(compiler, type, op, dst, args[0].arg, isImm ? args[1].argw : args[1].arg, isImm ? SimdOp::rmIsImm : SimdOp::rmIsGpr);
+
+    if (SLJIT_IS_MEM(args[2].arg)) {
+        sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | type, dst, args[2].arg, args[2].argw);
+    }
+}
diff --git a/third_party/sljit b/third_party/sljit
index 2c105e246..f7037aa15 160000
--- a/third_party/sljit
+++ b/third_party/sljit
@@ -1 +1 @@
-Subproject commit 2c105e2461b0d5b6c9c632753522457ca442f9dd
+Subproject commit f7037aa1567e0b27c7cf61fbe02f68971e8fa345
diff --git a/tools/run-tests.py b/tools/run-tests.py
index 892b04052..499de2d05 100755
--- a/tools/run-tests.py
+++ b/tools/run-tests.py
@@ -65,7 +65,7 @@ def _run_wast_tests(engine, files, is_fail):
    fails = 0
    for file in files:
        if jit:
-            filename = os.path.basename(file) 
+            filename = os.path.basename(file)
            if filename in JIT_EXCLUDE_FILES:
                continue