[LA64_DYNAREC] Add/Opt more mmx/sse ops (#2565)

* [LA64_DYNAREC] Add/Opt PEXTR{B,W,D,Q}/PINSR{B,W,D,Q}.
  * 0f.c4/c5 PINSRW/PEXTRW mmx ops.
  * 66.0f.3a.14/15/16 PEXTR{B,W,D/Q} SSE4 ops.
  * 66.0f.c4/c5 PINSRW/PEXTRW sse ops.

* [LA64_DYNAREC] Add more SSE3/SSE4 ops
66.0f.38.28 PMULDQ
66.0f.38.2a MOVNTDQA
66.0f.38.37 PCMPGTQ
66.0f.38.38/3b/3c/3f PMINSB/PMINUD/PMAXSB/PMAXUD
66.0f.3a.17 EXTRACTPS
66.0f.3a.41 DPPD
opt 66.0f.3a.40 DPPS
This commit is contained in:
phorcys 2025-04-24 19:26:57 +08:00 committed by GitHub
parent 2e90a5dbaf
commit 37dcb1a77b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 180 additions and 46 deletions

View File

@ -567,7 +567,7 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
XVMUL_W(v0, v0, v1);
VSRLI_W(v0, v0, 14);
VADDI_WU(v0, v0, 1);
VSRLNI_H_W(q0, v0, 1);
VSRLNI_H_W(q0, v0, 1);
break;
case 0x1C:
INST_NAME("PABSB Gm,Em");
@ -1922,6 +1922,37 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
SDxw(gd, ed, fixedaddress);
}
break;
case 0xC4:
// 0F C4 (MMX form): PINSRW — insert a 16-bit value into one word lane of Gm.
// The lane is chosen by the immediate byte, masked to 0..3.
INST_NAME("PINSRW Gm,Ed,Ib");
nextop = F8;
GETGM(v0);
if (MODREG) {
// Register source: consume the immediate, then map the x86 GPR to its native register.
u8 = (F8) & 3;
ed = TO_NAT((nextop & 7) + (rex.b << 3));
} else {
// Memory source: the immediate is read only after geted() has returned the
// updated addr, then the halfword is loaded into x3.
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &wback, x2, x4, &fixedaddress, rex, NULL, 1, 1);
u8 = (F8) & 3;
ed = x3;
LD_HU(ed, wback, fixedaddress);
}
// Write the value held in ed into halfword lane u8 of v0; other lanes are untouched.
VINSGR2VR_H(v0, ed, u8);
break;
case 0xC5:
// 0F C5 (MMX form): PEXTRW — extract one 16-bit lane of Em into Gd.
// The lane is chosen by the immediate byte, masked to 0..3.
INST_NAME("PEXTRW Gd,Em,Ib");
nextop = F8;
GETGD;
if (MODREG) {
// Register source: pick halfword lane u8 (the _HU form is the unsigned pick).
GETEM(v0, 1);
u8 = (F8) & 3;
VPICKVE2GR_HU(gd, v0, u8);
} else {
// Memory source: load the selected halfword directly from wback + 2*u8.
// NOTE(review): fixedaddress is not added to this load. geted() is called
// with 0 where the sibling cases pass 1 — presumably that forces a zero
// offset; confirm against geted()'s contract.
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &wback, x2, x4, &fixedaddress, rex, NULL, 0, 1);
u8 = (F8) & 3;
LD_HU(gd, wback, (u8 << 1));
}
break;
case 0xC6:
INST_NAME("SHUFPS Gx, Ex, Ib");
nextop = F8;

View File

@ -628,6 +628,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETGX_empty(q0);
VSLLWIL_D_W(q0, q1, 0);
break;
case 0x28:
// 66 0F 38 28: PMULDQ — signed multiply of the even-indexed 32-bit lanes of
// Gx and Ex, giving two 64-bit products written to Gx.
INST_NAME("PMULDQ Gx, Ex");
nextop = F8;
GETGX(q0, 1);
GETEX(q1, 0, 0);
// Widening multiply of the even words (W) into doublewords (D).
VMULWEV_D_W(q0, q0, q1);
break;
case 0x29:
INST_NAME("PCMPEQQ Gx, Ex"); // SSE4 opcode!
nextop = F8;
@ -635,6 +642,20 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETGX_empty(q0);
VSEQ_D(q0, q0, q1);
break;
case 0x2A:
// 66 0F 38 2A: MOVNTDQA — 128-bit load into Gx. No non-temporal hint is
// emitted here; the memory form is a plain vector load.
INST_NAME("MOVNTDQA Gx, Ex");
nextop = F8;
if (MODREG) {
// Register source: fetch the source register before allocating the
// destination, then copy it by OR-ing the source with itself.
v1 = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0);
GETGX_empty(v0);
VOR_V(v0, v1, v1);
} else {
// Memory source: destination is taken "empty" since it is fully overwritten.
GETGX_empty(v0);
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
VLD(v0, ed, fixedaddress);
}
break;
case 0x2B:
INST_NAME("PACKUSDW Gx, Ex"); // SSE4 opcode!
nextop = F8;
@ -700,6 +721,20 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETGX_empty(q0);
VSLLWIL_DU_WU(q0, q1, 0);
break;
case 0x37:
// 66 0F 38 37: PCMPGTQ — per 64-bit lane signed compare, Gx = (Gx > Ex) mask.
INST_NAME("PCMPGTQ Gx, Ex"); // SSE4 opcode!
nextop = F8;
GETGX(q0, 1);
GETEX(q1, 0, 0);
// Only a "set if less than" form is used, so the operands are swapped:
// (Ex < Gx) is the same predicate as (Gx > Ex).
VSLT_D(q0, q1, q0);
break;
case 0x38:
// 66 0F 38 38: PMINSB — per-byte signed minimum of Gx and Ex, result in Gx.
INST_NAME("PMINSB Gx, Ex"); // SSE4 opcode!
nextop = F8;
GETEX(q1, 0, 0);
GETGX(q0, 1);
// _B is the signed byte variant (the unsigned forms carry a U suffix).
VMIN_B(q0, q0, q1);
break;
case 0x39:
INST_NAME("PMINSD Gx, Ex"); // SSE4 opcode!
nextop = F8;
@ -714,6 +749,20 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETGX(q0, 1);
VMIN_HU(q0, q0, q1);
break;
case 0x3B:
// 66 0F 38 3B: PMINUD — per 32-bit lane unsigned minimum of Gx and Ex, result in Gx.
INST_NAME("PMINUD Gx, Ex"); // SSE4 opcode!
nextop = F8;
GETEX(q1, 0, 0);
GETGX(q0, 1);
// _WU is the unsigned 32-bit word variant.
VMIN_WU(q0, q0, q1);
break;
case 0x3C:
// 66 0F 38 3C: PMAXSB — per-byte signed maximum of Gx and Ex, result in Gx.
INST_NAME("PMAXSB Gx, Ex"); // SSE4 opcode!
nextop = F8;
GETEX(q1, 0, 0);
GETGX(q0, 1);
// _B is the signed byte variant (the unsigned forms carry a U suffix).
VMAX_B(q0, q0, q1);
break;
case 0x3D:
INST_NAME("PMAXSD Gx, Ex"); // SSE4 opcode!
nextop = F8;
@ -728,6 +777,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETGX(q0, 1);
VMAX_HU(q0, q0, q1);
break;
case 0x3F:
// 66 0F 38 3F: PMAXUD — per 32-bit lane unsigned maximum of Gx and Ex, result in Gx.
INST_NAME("PMAXUD Gx, Ex"); // SSE4 opcode!
nextop = F8;
GETEX(q1, 0, 0);
GETGX(q0, 1);
// _WU is the unsigned 32-bit word variant.
VMAX_WU(q0, q0, q1);
break;
case 0x40:
INST_NAME("PMULLD Gx, Ex"); // SSE4 opcode!
nextop = F8;
@ -1008,6 +1064,38 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
}
}
break;
case 0x14:
// 66 0F 3A 14: PEXTRB — extract one byte lane of Gx (immediate masked to
// 0..15) into a GPR or store it to memory.
INST_NAME("PEXTRB Ed, Gx, Ib");
nextop = F8;
GETGX(q0, 0);
if (MODREG) {
// Register destination: unsigned pick straight into the native GPR.
ed = TO_NAT((nextop & 7) + (rex.b << 3));
u8 = (F8) & 15;
VPICKVE2GR_BU(ed, q0, u8);
} else {
// Memory destination: pick the byte into scratch x1, then store it.
// NOTE(review): this path performs a store but is preceded by SMREAD() and
// has no SMWRITE2() after it, unlike case 0x17 (EXTRACTPS) — confirm
// whether the memory-ordering macros here are the intended ones.
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &wback, x2, x4, &fixedaddress, rex, NULL, 1, 1);
u8 = (F8) & 15;
VPICKVE2GR_BU(x1, q0, u8);
ST_B(x1, wback, fixedaddress);
}
break;
case 0x15:
// 66 0F 3A 15: PEXTRW — extract one 16-bit lane of Gx (immediate masked to
// 0..7) into a GPR or store it to memory.
INST_NAME("PEXTRW Ed, Gx, Ib");
nextop = F8;
GETGX(q0, 0);
if (MODREG) {
// Register destination: unsigned pick straight into the native GPR.
ed = TO_NAT((nextop & 7) + (rex.b << 3));
u8 = (F8) & 7;
VPICKVE2GR_HU(ed, q0, u8);
} else {
// Memory destination: pick the halfword into scratch x1, then store it.
// NOTE(review): this path performs a store but is preceded by SMREAD() and
// has no SMWRITE2() after it, unlike case 0x17 (EXTRACTPS) — confirm
// whether the memory-ordering macros here are the intended ones.
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &wback, x2, x4, &fixedaddress, rex, NULL, 1, 1);
u8 = (F8) & 7;
VPICKVE2GR_HU(x1, q0, u8);
ST_H(x1, wback, fixedaddress);
}
break;
case 0x16:
if (rex.w) {
INST_NAME("PEXTRQ Ed, Gx, Ib");
@ -1021,12 +1109,9 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
ed = TO_NAT((nextop & 7) + (rex.b << 3));
u8 = F8;
if (rex.w) {
VBSRL_V(d0, q0, (u8 & 1) * 8);
MOVFR2GR_D(ed, d0);
VPICKVE2GR_D(ed, q0, (u8 & 1));
} else {
VBSRL_V(d0, q0, (u8 & 3) * 4);
MOVFR2GR_S(ed, d0);
ZEROUP(ed);
VPICKVE2GR_W(ed, q0, (u8 & 3));
}
} else {
addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 1, 1);
@ -1041,6 +1126,23 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
SMWRITE2();
}
break;
case 0x17:
// 66 0F 3A 17: EXTRACTPS — extract one 32-bit lane of Gx (immediate masked
// to 0..3) into a GPR or store it to memory.
// NOTE(review): the name string says "Ew" although the code moves a 32-bit
// value; it is left untouched here because it is a runtime string.
INST_NAME("EXTRACTPS Ew, Gx, Ib");
nextop = F8;
GETGX(q0, 0);
if (MODREG) {
// Register destination: the _WU pick is the unsigned (zero-extending) form.
ed = TO_NAT((nextop & 7) + (rex.b << 3));
u8 = F8 & 0b11;
VPICKVE2GR_WU(ed, q0, u8);
} else {
// Memory destination: byte-shift the vector right by 4*u8 so the selected
// lane lands in element 0 of scratch d0, then store that single as 32 bits.
// NOTE(review): no SMREAD() precedes geted() here, unlike cases 0x14/0x15 — confirm.
addr = geted(dyn, addr, ninst, nextop, &ed, x3, x5, &fixedaddress, rex, NULL, 1, 1);
u8 = F8 & 0b11;
d0 = fpu_get_scratch(dyn);
// u8 is already masked to 0..3 above, so the extra "& 3" does not change the value.
VBSRL_V(d0, q0, (u8 & 3) * 4);
FST_S(d0, ed, fixedaddress);
SMWRITE2();
}
break;
case 0x20:
INST_NAME("PINSRB Gx, ED, Ib");
nextop = F8;
@ -1098,43 +1200,48 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
u8 = F8;
v0 = fpu_get_scratch(dyn);
v1 = fpu_get_scratch(dyn);
VXOR_V(v0, v0, v0);
VXOR_V(v1, v1, v1);
if (u8 & 0x10) {
VEXTRINS_W(v0, q0, 0x00);
VEXTRINS_W(v1, q1, 0x00);
v2 = fpu_get_scratch(dyn);
VFMUL_S(v0, q0, q1);
VXOR_V(v2, v2, v2);
for (int i = 0; i < 4; ++i) {
if (!(u8 & (1 << (4 + i)))) {
VEXTRINS_W(v0, v2, (i << 4));
}
}
if (u8 & 0x20) {
VEXTRINS_W(v0, q0, 0x11);
VEXTRINS_W(v1, q1, 0x11);
VSHUF4I_W(v1, v0, 0b10110001); // v0[a,b,c,d] v1[b,a,d,c]
VFADD_S(v0, v0, v1); // v0[ab,ba,cd,dc]
VSHUF4I_W(v1, v0, 0b01001110); // v1[cd,dc,ab,ba]
VFADD_S(v0, v0, v1); // v0[abcd,badc,cdab,dcba]
VREPLVEI_W(q0, v0, 0);
for (int i = 0; i < 4; ++i) {
if (!(u8 & (1 << i))) {
VEXTRINS_W(q0, v2, (i << 4));
}
}
if (u8 & 0x40) {
VEXTRINS_W(v0, q0, 0x22);
VEXTRINS_W(v1, q1, 0x22);
break;
case 0x41:
INST_NAME("DPPD Gx, Ex, Ib");
nextop = F8;
GETGX(q0, 1);
GETEX(q1, 0, 1);
u8 = F8;
v0 = fpu_get_scratch(dyn);
v1 = fpu_get_scratch(dyn);
v2 = fpu_get_scratch(dyn);
VFMUL_D(v0, q0, q1);
VXOR_V(v2, v2, v2);
for (int i = 0; i < 2; ++i) {
if (!(u8 & (1 << (4 + i)))) {
VEXTRINS_D(v0, v2, (i << 4));
}
}
if (u8 & 0x80) {
VEXTRINS_W(v0, q0, 0x33);
VEXTRINS_W(v1, q1, 0x33);
}
VFMUL_S(v0, v0, v1);
VPACKOD_W(v1, v0, v0);
VPACKEV_W(v0, v0, v0);
VFADD_S(v0, v0, v1);
VPACKOD_D(v1, v0, v0);
VPACKEV_D(v0, v0, v0);
VFADD_S(v0, v0, v1);
VXOR_V(q0, q0, q0);
if (u8 & 0x1) {
VEXTRINS_W(q0, v0, 0x00);
}
if (u8 & 0x2) {
VEXTRINS_W(q0, v0, 0x11);
}
if (u8 & 0x4) {
VEXTRINS_W(q0, v0, 0x22);
}
if (u8 & 0x8) {
VEXTRINS_W(q0, v0, 0x33);
VSHUF4I_W(v1, v0, 0b01001110); // v0[a,b] v1[b,a]
VFADD_D(v0, v0, v1); // v0[ab,ba]
VREPLVEI_D(q0, v0, 0);
for (int i = 0; i < 2; ++i) {
if (!(u8 & (1 << i))) {
VEXTRINS_D(q0, v2, (i << 4));
}
}
break;
case 0x44:
@ -2059,9 +2166,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
ed = x3;
LD_HU(ed, wback, fixedaddress);
}
d0 = fpu_get_scratch(dyn);
MOVGR2FR_D(d0, ed);
VEXTRINS_H(v0, d0, (u8 << 4));
VINSGR2VR_H(v0, ed, u8);
break;
case 0xC5:
INST_NAME("PEXTRW Gd, Ex, Ib");
@ -2071,9 +2176,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETEX(v0, 0, 1);
u8 = (F8) & 7;
v1 = fpu_get_scratch(dyn);
VBSRL_V(v1, v0, (u8 << 1));
MOVFR2GR_D(gd, v1);
BSTRPICK_D(gd, gd, 15, 0);
VPICKVE2GR_HU(gd, v0, u8);
} else {
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &wback, x2, x4, &fixedaddress, rex, NULL, 1, 1);