[LA64_DYNAREC] Add SSE1/SSE2's cvt mmx ops. (#2538)

This commit is contained in:
phorcys
2025-04-17 14:12:28 +08:00
committed by GitHub
parent 46a91e0ad9
commit 9579dd9ff1
2 changed files with 166 additions and 0 deletions

View File

@@ -283,6 +283,17 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
SMWRITE2(); SMWRITE2();
} }
break; break;
case 0x2A:
INST_NAME("CVTPI2PS Gx,Em");
nextop = F8;
// Convert two packed int32 from MMX reg/mem (Em) to two packed
// single floats in the low half of SSE reg Gx; upper half of Gx kept.
GETGX(v0, 1);
GETEM(v1, 0);
q0 = fpu_get_scratch(dyn);
// int32 -> float32 can be inexact, so honor the current MXCSR rounding mode
u8 = sse_setround(dyn, ninst, x1, x2);
VFFINT_S_W(q0, v1);
x87_restoreround(dyn, ninst, u8);
// insert only the low 64 bits (two floats) into Gx; bits 127:64 are preserved
VEXTRINS_D(v0, q0, VEXTRINS_IMM_4_0(0, 0));
break;
case 0x2B: case 0x2B:
INST_NAME("MOVNTPS Ex,Gx"); INST_NAME("MOVNTPS Ex,Gx");
nextop = F8; nextop = F8;
@@ -297,6 +308,73 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
VST(v0, ed, fixedaddress); VST(v0, ed, fixedaddress);
} }
break; break;
case 0x2C:
INST_NAME("CVTTPS2PI Gm,Ex");
nextop = F8;
// Convert-with-truncation two single floats (low half of Ex) to two
// int32 in MMX reg Gm. x86 yields 0x80000000 ("integer indefinite")
// for NaN and out-of-range inputs; LoongArch saturates differently,
// hence the fixup path below.
GETGM(v0);
GETEX(v1, 0, 0);
if (BOX64ENV(dynarec_fastround)) {
// fast path: trust the hardware truncation, skip the x86 fixup
VFTINTRZ_W_S(v0, v1);
} else {
MOVGR2FCSR(FCSR2, xZR); // reset all bits
VFTINTRZ_W_S(v0, v1);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
MOV32w(x3, (1 << FR_V) | (1 << FR_O)); // invalid-op | overflow flags
AND(x5, x5, x3);
BEQZ_MARK3(x5); // no fp exception, work done.
// check +/-NaN, +overflow, replace with 0x80000000
q0 = fpu_get_scratch(dyn);
q1 = fpu_get_scratch(dyn); // mask
d0 = fpu_get_scratch(dyn);
VLDI(q0, 0b1001110000000); // broadcast 0x80000000 to all
VLDI(d0, (0b10011 << 8) | 0x4f); // broadcast 0x4f000000 (2147483648.0f)
VFCMP_S(q1, d0, v1, cULE); // lane mask: unordered (NaN) || 2^31 <= v1[x]
VBITSEL_V(v0, v0, q0, q1); // flagged lanes take 0x80000000
MARK3;
}
break;
case 0x2D:
INST_NAME("CVTPS2PI Gm, Ex");
nextop = F8;
// Convert two single floats (low half of Ex) to two int32 in MMX reg
// Gm, rounding per the current MXCSR mode (unlike CVTTPS2PI which
// always truncates). x86 yields 0x80000000 for NaN / out-of-range.
GETGM(v0);
GETEX(v1, 0, 0);
u8 = sse_setround(dyn, ninst, x4, x6);
if (BOX64ENV(dynarec_fastround)) {
// BUGFIX: use the current-rounding-mode conversion. The previous
// VFTINTRZ_W_S truncated unconditionally, ignoring the rounding mode
// installed by sse_setround just above, and disagreed with the
// non-fast path below (and with the 66 0F 2D CVTPD2PI fast path).
VFTINT_W_S(v0, v1);
} else {
MOVGR2FCSR(FCSR2, xZR); // reset all bits
VFTINT_W_S(v0, v1);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
MOV32w(x3, (1 << FR_V) | (1 << FR_O)); // invalid-op | overflow flags
AND(x5, x5, x3);
BEQZ_MARK3(x5); // no fp exception, work done, fast path.
// check +/-NaN, +overflow, replace with 0x80000000
/* LoongArch follows IEEE754-2008,
if val < -2147483648.0f got -2147483648 match sse
if val > 2147483648.0f got 2147483647 need mask
but lucky _Float32 is not accurate:
-2147483648.0f is 0xcf000000 (_Float32)
-2147483520.0f is 0xceffffff (_Float32)
2147483648.0f is 0x4f000000 (_Float32)
2147483520.0f is 0x4effffff (_Float32)
combine (unordered || gt 0x4f000000)
use cULE for (unordered || 0x4f000000 <= v1[x])
*/
q0 = fpu_get_scratch(dyn);
q1 = fpu_get_scratch(dyn); // mask
d0 = fpu_get_scratch(dyn);
VLDI(q0, 0b1001110000000); // broadcast 0x80000000 to all
VLDI(d0, (0b10011 << 8) | 0x4f); // broadcast 0x4f000000 (2147483648.0f)
VFCMP_S(q1, d0, v1, cULE); // lane mask: NaN or >= 2^31
VBITSEL_V(v0, v0, q0, q1); // flagged lanes take 0x80000000
MARK3;
}
x87_restoreround(dyn, ninst, u8);
break;
case 0x2E: case 0x2E:
// no special check... // no special check...
case 0x2F: case 0x2F:

View File

@@ -208,6 +208,14 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
SMWRITE2(); SMWRITE2();
} }
break; break;
case 0x2A:
INST_NAME("CVTPI2PD Gx,Em");
nextop = F8;
// Convert two packed int32 from MMX reg/mem (Em) to two packed
// doubles in SSE reg Gx. int32 -> float64 is exact for every 32-bit
// integer, so no rounding-mode setup is needed.
GETGX(v0, 1);
GETEM(v1, 0);
// FIX: dropped the unused `q0 = fpu_get_scratch(dyn);` — the scratch
// was allocated but never referenced, wasting an FPU scratch slot.
VFFINTL_D_W(v0, v1); // widen the two low int32 lanes of Em into Gx
break;
case 0x2B: case 0x2B:
INST_NAME("MOVNTPD Ex,Gx"); INST_NAME("MOVNTPD Ex,Gx");
nextop = F8; nextop = F8;
@@ -222,6 +230,86 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
VST(v0, ed, fixedaddress); VST(v0, ed, fixedaddress);
} }
break; break;
case 0x2C:
INST_NAME("CVTTPD2PI Gm,Ex");
nextop = F8;
// Convert-with-truncation two doubles (Ex) to two int32 in MMX reg Gm.
// x86 yields 0x80000000 for NaN / out-of-range lanes; the slow path
// below re-runs each lane scalar-wise to find which one faulted and
// patches only that lane.
GETGM(v0);
GETEX(v1, 0, 0);
if (BOX64ENV(dynarec_fastround)) {
// fast path: trust the hardware truncation, skip the x86 fixup
VFTINTRZ_W_D(v0, v1, v1);
} else {
MOVGR2FCSR(FCSR2, xZR); // reset all bits
VFTINTRZ_W_D(v0, v1, v1);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
MOV32w(x3, (1 << FR_V) | (1 << FR_O)); // invalid-op | overflow flags
AND(x5, x5, x3);
BEQZ_MARK3(x5); // no fp exception, work done.
q0 = fpu_get_scratch(dyn);
// redo lane 0 alone to see if it was the faulting one
MOVGR2FCSR(FCSR2, xZR); // reset all bits
FTINTRZ_W_D(v0, v1);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
AND(x5, x5, x3);
BEQZ_MARK(x5);
MOV32w(x1, 0x80000000); // x86 "integer indefinite"
MOVGR2FR_W(v0, x1); // patch low lane of Gm
MARK;
// redo lane 1 alone the same way
MOVGR2FCSR(FCSR2, xZR); // reset all bits
VSHUF4I_W(q0, v1, 0b1110); // get v1 high 64bits
FTINTRZ_W_D(q0, q0);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
AND(x5, x5, x3);
BEQZ_MARK2(x5);
MOV32w(x1, 0x80000000);
MOVGR2FRH_W(v0, x1); // patch high lane of Gm
B_MARK3_nocond;
MARK2;
VEXTRINS_W(v0, q0, VEXTRINS_IMM_4_0(1, 0)); // lane 1 was fine: keep its scalar result
MARK3;
}
break;
case 0x2D:
INST_NAME("CVTPD2PI Gm,Ex");
nextop = F8;
// Convert two doubles (Ex) to two int32 in MMX reg Gm, rounding per
// the current MXCSR mode (unlike CVTTPD2PI which truncates). Same
// per-lane fixup strategy as 0x2C: re-run each lane scalar-wise to
// find the faulting one and patch it with 0x80000000.
GETGM(v0);
GETEX(v1, 0, 0);
u8 = sse_setround(dyn, ninst, x4, x6);
if (BOX64ENV(dynarec_fastround)) {
// fast path: trust the hardware conversion, skip the x86 fixup
VFTINT_W_D(v0, v1, v1);
} else {
MOVGR2FCSR(FCSR2, xZR); // reset all bits
VFTINT_W_D(v0, v1, v1);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
MOV32w(x3, (1 << FR_V) | (1 << FR_O)); // invalid-op | overflow flags
AND(x5, x5, x3);
BEQZ_MARK3(x5); // no fp exception, work done.
q0 = fpu_get_scratch(dyn);
// redo lane 0 alone to see if it was the faulting one
MOVGR2FCSR(FCSR2, xZR); // reset all bits
FTINT_W_D(v0, v1);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
AND(x5, x5, x3);
BEQZ_MARK(x5);
MOV32w(x1, 0x80000000); // x86 "integer indefinite"
MOVGR2FR_W(v0, x1); // patch low lane of Gm
MARK;
// redo lane 1 alone the same way
MOVGR2FCSR(FCSR2, xZR); // reset all bits
VSHUF4I_W(q0, v1, 0b1110); // get v1 high 64bits
FTINT_W_D(q0, q0);
MOVFCSR2GR(x5, FCSR2); // get back FPSR to check
AND(x5, x5, x3);
BEQZ_MARK2(x5);
MOV32w(x1, 0x80000000);
MOVGR2FRH_W(v0, x1); // patch high lane of Gm
B_MARK3_nocond;
MARK2;
VEXTRINS_W(v0, q0, VEXTRINS_IMM_4_0(1, 0)); // lane 1 was fine: keep its scalar result
MARK3;
}
x87_restoreround(dyn, ninst, u8);
break;
case 0x2E: case 0x2E:
// no special check... // no special check...
case 0x2F: case 0x2F: