diff --git a/src/dynarec/la64/dynarec_la64_0f.c b/src/dynarec/la64/dynarec_la64_0f.c
index 5280c7ff3..1d2d6b03d 100644
--- a/src/dynarec/la64/dynarec_la64_0f.c
+++ b/src/dynarec/la64/dynarec_la64_0f.c
@@ -652,13 +652,11 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(v1, 0, 0);
             if (!BOX64ENV(dynarec_fastnan) && v0 != v1) {
                 q0 = fpu_get_scratch(dyn);
-                // always copy from v1 if any oprand is NaN
-                VFCMP_S(q0, v0, v1, cUN);
-                VANDN_V(v0, q0, v0);
-                VAND_V(q0, q0, v1);
-                VOR_V(v0, v0, q0);
+                VFCMP_S(q0, v1, v0, cULE);
+                VBITSEL_V(v0, v0, v1, q0);
+            } else {
+                VFMIN_S(v0, v0, v1);
             }
-            VFMIN_S(v0, v0, v1);
             break;
         case 0x5E:
             INST_NAME("DIVPS Gx, Ex");
@@ -688,13 +686,14 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             GETEX(v1, 0, 0);
             if (!BOX64ENV(dynarec_fastnan) && v0 != v1) {
                 q0 = fpu_get_scratch(dyn);
-                // always copy from v1 if any oprand is NaN
-                VFCMP_S(q0, v0, v1, cUN);
-                VANDN_V(v0, q0, v0);
-                VAND_V(q0, q0, v1);
-                VOR_V(v0, v0, q0);
+                q1 = fpu_get_scratch(dyn);
+                VFCMP_S(q0, v1, v0, cUEQ); // cUEQ = unordered or equal: if either source is NaN, pick v1; on equality either source works, but for +0.0 == -0.0 x86 SSE picks v1
+                VFCMP_S(q1, v0, v1, cLT);
+                VOR_V(q0, q0, q1);
+                VBITSEL_V(v0, v0, v1, q0);
+            } else {
+                VFMAX_S(v0, v0, v1);
             }
-            VFMAX_S(v0, v0, v1);
             break;
         case 0x6F:
             INST_NAME("MOVQ Gm, Em");
diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c
index 989f59e52..61f421654 100644
--- a/src/dynarec/la64/dynarec_la64_660f.c
+++ b/src/dynarec/la64/dynarec_la64_660f.c
@@ -38,7 +38,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
     int64_t j64;
     uint64_t tmp64u, tmp64u2;
     int v0, v1, v2;
-    int q0, q1;
+    int q0, q1, q2;
     int d0, d1, d2;
     int64_t fixedaddress, gdoffset;
     int unscaled;
@@ -52,10 +52,10 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
     MAYUSE(j64);
 #if STEP > 1
     static const int8_t round_round[] = {
-        0xE, // round to nearest with ties to even
-        0x2, // round toward minus infinity
-        0x6, // round toward plus infinity
-        0xA // round toward zero
+        0x3, // round to nearest with ties to even
+        0x0, // round toward minus infinity
+        0x1, // round toward plus infinity
+        0x2 // round toward zero
     };
 #endif
 
@@ -382,6 +382,18 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     XVSRLNI_H_W(v0, v0, 1);
                     XVPERMI_D(q0, v0, 0b1000);
                     break;
+                case 0x10:
+                    INST_NAME("PBLENDVB Gx,Ex");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
+                    v0 = sse_get_reg(dyn, ninst, x1, 0, 0); // XMM0
+                    v1 = fpu_get_scratch(dyn);
+                    if (q0 != q1) {
+                        VSLTI_B(v1, v0, 0); // bit[7]==1 -> fill with 0xff
+                        VBITSEL_V(q0, q0, q1, v1);
+                    }
+                    break;
                 case 0x14:
                     INST_NAME("BLENDVPS Gx,Ex");
                     nextop = F8;
@@ -394,6 +406,18 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                         VBITSEL_V(q0, q0, q1, v1);
                     }
                     break;
+                case 0x15:
+                    INST_NAME("BLENDVPD Gx,Ex");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEX(q1, 0, 0);
+                    v0 = sse_get_reg(dyn, ninst, x1, 0, 0); // XMM0
+                    v1 = fpu_get_scratch(dyn);
+                    if (q0 != q1) {
+                        VSLTI_D(v1, v0, 0);
+                        VBITSEL_V(q0, q0, q1, v1);
+                    }
+                    break;
                 case 0x17:
                     INST_NAME("PTEST Gx, Ex");
                     nextop = F8;
@@ -712,6 +736,52 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
         case 0x3A: // these are some more SSSE3+ opcodes
             opcode = F8;
             switch (opcode) {
+                case 0x08:
+                    INST_NAME("ROUNDPS Gx, Ex, Ib");
+                    nextop = F8;
+                    GETEX(q1, 0, 1);
+                    GETGX_empty(q0);
+                    u8 = F8;
+                    v1 = fpu_get_scratch(dyn);
+                    if (u8 & 4) {
+                        u8 = sse_setround(dyn, ninst, x1, x2);
+                        VFRINT_S(q0, q1);
+                        x87_restoreround(dyn, ninst, u8);
+                    } else {
+                        VFRINTRRD_S(q0, q1, round_round[u8 & 3]);
+                    }
+                    break;
+                case 0x09:
+                    INST_NAME("ROUNDPD Gx, Ex, Ib");
+                    nextop = F8;
+                    GETEX(q1, 0, 1);
+                    GETGX_empty(q0);
+                    u8 = F8;
+                    v1 = fpu_get_scratch(dyn);
+                    if (u8 & 4) {
+                        u8 = sse_setround(dyn, ninst, x1, x2);
+                        VFRINT_D(q0, q1);
+                        x87_restoreround(dyn, ninst, u8);
+                    } else {
+                        VFRINTRRD_D(q0, q1, round_round[u8 & 3]);
+                    }
+                    break;
+                case 0x0A:
+                    INST_NAME("ROUNDSS Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEXSS(q1, 0, 1);
+                    u8 = F8;
+                    v1 = fpu_get_scratch(dyn);
+                    if (u8 & 4) {
+                        u8 = sse_setround(dyn, ninst, x1, x2);
+                        VFRINT_S(v1, q1);
+                        x87_restoreround(dyn, ninst, u8);
+                    } else {
+                        VFRINTRRD_S(v1, q1, round_round[u8 & 3]);
+                    }
+                    VEXTRINS_W(q0, v1, 0);
+                    break;
                 case 0x0B:
                     INST_NAME("ROUNDSD Gx, Ex, Ib");
                     nextop = F8;
@@ -728,6 +798,39 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     }
                     VEXTRINS_D(q0, v1, 0);
                     break;
+                case 0x0C:
+                    INST_NAME("BLENDPS Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEXSS(q1, 0, 1);
+                    u8 = F8 & 0b1111;
+                    if ((u8 & 0b11) == 0b11) {
+                        VEXTRINS_D(q0, q1, 0);
+                        u8 &= ~0b0011;
+                    }
+                    if ((u8 & 0b1100) == 0b1100) {
+                        VEXTRINS_D(q0, q1, 0b00010001);
+                        u8 &= ~0b1100;
+                    }
+                    for (int i = 0; i < 4; ++i)
+                        if (u8 & (1 << i)) {
+                            VEXTRINS_W(q0, q1, (i << 4) | i);
+                        }
+                    break;
+                case 0x0D:
+                    INST_NAME("BLENDPD Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    GETEXSD(q1, 0, 1);
+                    u8 = F8 & 0b11;
+                    if (u8 == 0b01) {
+                        VEXTRINS_D(q0, q1, 0b00000000);
+                    } else if (u8 == 0b10) {
+                        VEXTRINS_D(q0, q1, 0b00010001);
+                    } else if (u8 == 0b11) {
+                        VOR_V(q0, q1, q1);
+                    }
+                    break;
                 case 0x0F:
                     INST_NAME("PALIGNR Gx, Ex, Ib");
                     nextop = F8;
@@ -844,6 +947,35 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
                     GETED(1);
                     u8 = F8;
                     VINSGR2VR_B(q0, ed, (u8 & 0xf));
+                    break;
+                case 0x21:
+                    INST_NAME("INSERTPS Gx, Ex, Ib");
+                    nextop = F8;
+                    GETGX(q0, 1);
+                    u8 = F8;
+                    uint8_t src_index = (u8 >> 6) & 3;
+                    uint8_t dst_index = (u8 >> 4) & 3;
+                    uint8_t zmask = u8 & 0xf;
+                    q2 = fpu_get_scratch(dyn);
+                    if (MODREG) {
+                        q1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);
+                        VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(dst_index, src_index));
+                    } else {
+                        SMREAD();
+                        addr = geted(dyn, addr, ninst, nextop, &wback, x3, x5, &fixedaddress, rex, NULL, 0, 1);
+                        u8 = F8;
+                        FLD_S(q2, wback, fixedaddress);
+                        VEXTRINS_W(q0, q2, VEXTRINS_IMM_4_0(dst_index, 0)); // src_index is 0 when Ex is a memory operand
+                    }
+                    VXOR_V(q2, q2, q2);
+                    if (zmask) {
+                        for (uint8_t i = 0; i < 4; i++) {
+                            if (zmask & (1 << i)) {
+                                VEXTRINS_W(q0, q2, VEXTRINS_IMM_4_0(i, 0));
+                            }
+                        }
+                    }
+                    break;
                 case 0x22:
                     INST_NAME("PINSRD Gx, ED, Ib");
@@ -1214,14 +1346,15 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX(v0, 1);
             GETEX(v1, 0, 0);
-            if (BOX64ENV(dynarec_fastnan)) {
-                VFMIN_D(v0, v0, v1);
-            } else {
+            // LoongArch FMIN/FMAX follow IEEE 754-2008: they return the non-NaN operand,
+            // and only return NaN when both operands are NaN.
+            // x86, however, returns the second operand whenever v0[x] or v1[x] is NaN.
+            if (!BOX64ENV(dynarec_fastnan)) {
                 q0 = fpu_get_scratch(dyn);
-                VFCMP_D(q0, v0, v1, sLT);
-                VAND_V(v0, v0, q0);
-                VANDN_V(q0, q0, v1);
-                VOR_V(v0, v0, q0);
+                VFCMP_D(q0, v1, v0, cULE);
+                VBITSEL_V(v0, v0, v1, q0);
+            } else {
+                VFMIN_D(v0, v0, v1);
             }
             break;
         case 0x5E:
@@ -1250,16 +1383,21 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX(v0, 1);
             GETEX(v1, 0, 0);
-            if (BOX64ENV(dynarec_fastnan)) {
-                VFMAX_D(v0, v0, v1);
-            } else {
+            // LoongArch FMIN/FMAX follow IEEE 754-2008: they return the non-NaN operand,
+            // and only return NaN when both operands are NaN.
+            // x86, however, returns the second operand whenever v0[x] or v1[x] is NaN.
+            if (!BOX64ENV(dynarec_fastnan) && v0 != v1) {
                 q0 = fpu_get_scratch(dyn);
-                VFCMP_D(q0, v1, v0, sLT);
-                VAND_V(v0, v0, q0);
-                VANDN_V(q0, q0, v1);
-                VOR_V(v0, v0, q0);
+                q1 = fpu_get_scratch(dyn);
+                VFCMP_D(q0, v1, v0, cUEQ); // cUEQ = unordered or equal: if either source is NaN, pick v1; on equality either source works, but for +0.0 == -0.0 x86 SSE picks v1
+                VFCMP_D(q1, v0, v1, cLT);
+                VOR_V(q0, q0, q1);
+                VBITSEL_V(v0, v0, v1, q0);
+            } else {
+                VFMAX_D(v0, v0, v1);
             }
             break;
+
         case 0x60:
             INST_NAME("PUNPCKLBW Gx,Ex");
             nextop = F8;
@@ -1767,6 +1905,23 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
             }
             BSTRINS_D(gd, x1, 15, 0);
             break;
+        case 0xC2:
+            INST_NAME("CMPPD Gx, Ex, Ib");
+            nextop = F8;
+            GETGX(v0, 1);
+            GETEX(v1, 0, 1);
+            u8 = F8;
+            switch (u8 & 7) {
+                case 0: VFCMP_D(v0, v0, v1, cEQ); break;  // Equal: true if EQ
+                case 1: VFCMP_D(v0, v0, v1, cLT); break;  // Less than: true if LT
+                case 2: VFCMP_D(v0, v0, v1, cLE); break;  // Less or equal: true if LT EQ
+                case 3: VFCMP_D(v0, v0, v1, cUN); break;  // Unordered: true if UN
+                case 4: VFCMP_D(v0, v0, v1, cUNE); break; // Not equal or unordered: true if UN LT GT
+                case 5: VFCMP_D(v0, v1, v0, cULE); break; // Greater or equal or unordered: true if UN EQ GT, use cULE (UN LT EQ) with v0/v1 swapped
+                case 6: VFCMP_D(v0, v1, v0, cULT); break; // Greater or unordered: true if UN GT, use cULT (UN LT) with v0/v1 swapped
+                case 7: VFCMP_D(v0, v0, v1, cOR); break;  // Ordered (not NaN): true if LT EQ GT
+            }
+            break;
         case 0xC4:
             INST_NAME("PINSRW Gx, Ed, Ib");
             nextop = F8;
diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h
index 39e53ff94..7b7fdc783 100644
--- a/src/dynarec/la64/la64_emitter.h
+++ b/src/dynarec/la64/la64_emitter.h
@@ -1443,7 +1443,10 @@ LSX instruction starts with V, LASX instruction starts with XV.
 #define VSLT_B(vd, vj, vk) EMIT(type_3R(0b01110000000001100, vk, vj, vd))
 #define VSLT_H(vd, vj, vk) EMIT(type_3R(0b01110000000001101, vk, vj, vd))
 #define VSLT_W(vd, vj, vk) EMIT(type_3R(0b01110000000001110, vk, vj, vd))
+#define VSLTI_B(vd, vj, imm5) EMIT(type_2RI5(0b01110010100001100, imm5, vj, vd))
+#define VSLTI_H(vd, vj, imm5) EMIT(type_2RI5(0b01110010100001101, imm5, vj, vd))
 #define VSLTI_W(vd, vj, imm5) EMIT(type_2RI5(0b01110010100001110, imm5, vj, vd))
+#define VSLTI_D(vd, vj, imm5) EMIT(type_2RI5(0b01110010100001111, imm5, vj, vd))
 #define VSLT_D(vd, vj, vk) EMIT(type_3R(0b01110000000001111, vk, vj, vd))
 #define VSLT_BU(vd, vj, vk) EMIT(type_3R(0b01110000000010000, vk, vj, vd))
 #define VSLT_HU(vd, vj, vk) EMIT(type_3R(0b01110000000010001, vk, vj, vd))
@@ -1916,7 +1919,9 @@
 #define VPICKVE2GR_DU(rd, vj, imm1) EMIT(type_2RI1(0b011100101111001111110, imm1, vj, rd))
 #define VFRINT_S(vd, vj) EMIT(type_2R(0b0111001010011101001101, vj, vd))
 #define VFRINT_D(vd, vj) EMIT(type_2R(0b0111001010011101001110, vj, vd))
-#define VFRINTRRD_D(vd, vj, imm4) EMIT(type_2RI4(0b011100101001110101, imm4, vj, vd))
+// imm4 = (rm << 2) | fmt, fmt 0b01 = .S, 0b10 = .D: vfrintrm.s 0001, vfrintrm.d 0010, ..., vfrintrne.s 1101, vfrintrne.d 1110
+#define VFRINTRRD_S(vd, vj, rm) EMIT(type_2RI4(0b011100101001110101, ((rm & 0b11) << 2) | 0b01, vj, vd))
+#define VFRINTRRD_D(vd, vj, rm) EMIT(type_2RI4(0b011100101001110101, ((rm & 0b11) << 2) | 0b10, vj, vd))
 #define VREPLGR2VR_B(vd, rj) EMIT(type_2R(0b0111001010011111000000, rj, vd))
 #define VREPLGR2VR_H(vd, rj) EMIT(type_2R(0b0111001010011111000001, rj, vd))
 #define VREPLGR2VR_W(vd, rj) EMIT(type_2R(0b0111001010011111000010, rj, vd))
diff --git a/src/dynarec/la64/la64_printer.c b/src/dynarec/la64/la64_printer.c
index 93aab1a39..54b3ad4d0 100644
--- a/src/dynarec/la64/la64_printer.c
+++ b/src/dynarec/la64/la64_printer.c
@@ -2204,6 +2204,98 @@ const char* la64_print(uint32_t opcode, uintptr_t addr)
         snprintf(buff, sizeof(buff), "%-15s %s, %s, %ld", "VST", Vt[Rd], Xt[Rj], signExtend(imm, 12));
         return buff;
     }
+    if (isMask(opcode, "000011010001aaaaakkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "VBITSEL.V", Vt[Rd], Vt[Rj], Vt[Rk], Vt[Ra]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100000kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CAF.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100001kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SAF.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100010kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CLT.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100011kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SLT.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100100kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CEQ.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100101kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SEQ.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100110kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CLE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010100111kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SLE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101000kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CUN.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101001kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SUN.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101010kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CULT.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101011kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SULT.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101100kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CUEQ.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101101kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SUEQ.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101110kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CULE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010101111kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SULE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010110000kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CNE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010110001kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SNE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010110100kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.COR.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010110101kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SOR.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010111000kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.CUNE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
+    if (isMask(opcode, "00001100010111001kkkkkjjjjjddddd", &a)) {
+        snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "VFCMP.SUNE.S", Vt[Rd], Vt[Rj], Vt[Rk]);
+        return buff;
+    }
     if (isMask(opcode, "00000000000000001000000000101000", &a)) {
         snprintf(buff, sizeof(buff), "X64CLRSM");
         return buff;