[DYNAREC] Introduce BOX64_DYNAREC_X87DOUBLE=2 to handle Low Precision x87 ([ARM64_DYNAREC] only for now)

This commit is contained in:
ptitSeb 2025-04-16 14:53:54 +02:00
parent 7f569247d5
commit d7eb87129e
22 changed files with 130 additions and 23 deletions

View File

@ -218,6 +218,7 @@ Force the use of float/double for x87 emulation.
* 0: Try to use float when possible for x87 emulation. [Default]
* 1: Only use Double for x87 emulation.
* 2: Check the x87 Precision Control field and use low precision when requested for x87 emulation.
### BOX64_EXIT

View File

@ -339,12 +339,13 @@ Tweak the memory barriers to reduce the performance impact by strong memory emulation
* 2 : All in 1, plus disabled the last write barriers.
=item B<BOX64_DYNAREC_X87DOUBLE> =I<0|1>
=item B<BOX64_DYNAREC_X87DOUBLE> =I<0|1|2>
Force the use of float/double for x87 emulation.
* 0 : Try to use float when possible for x87 emulation. [Default]
* 1 : Only use Double for x87 emulation.
* 2 : Check the x87 Precision Control field and use low precision when requested for x87 emulation.
=item B<BOX64_EMULATED_LIBS> =I<XXXX|XXXX:YYYY:ZZZZ>

View File

@ -681,6 +681,11 @@
"key": "1",
"description": "Only use Double for x87 emulation.",
"default": false
},
{
"key": "2",
"description": "Check Precision Control low precision on x87 emulation.",
"default": false
}
]
},

View File

@ -89,7 +89,8 @@ p0-p3 are used to pass scalable predicate arguments to a subroutine and to retur
#define x4 4
#define x5 5
#define x6 6
#define x7 7
#define x87pc 7
// x87pc can be a scratch, but check if it's used as the x87 PC and restore it if needed in that case
// 32bits version of scratch
#define w1 x1
#define w2 x2
@ -97,7 +98,7 @@ p0-p3 are used to pass scalable predicate arguments to a subroutine and to retur
#define w4 x4
#define w5 x5
#define w6 x6
#define w7 x7
#define w87pc x87pc
// emu is r0
#define xEmu 0
// ARM64 LR

View File

@ -1420,8 +1420,8 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
CSELw(x2, x2, x3, cLT); // x2 is min(lmem, lreg)
// x2 is min length 0-n_packed
MVNw_REG(x4, xZR);
LSLw_REG(x7, x4, x2);
BICw_REG(x1, x1, x7);
LSLw_REG(x87pc, x4, x2);
BICw_REG(x1, x1, x87pc);
LSLw_REG(x4, x4, x5);
ORRw_REG(x1, x1, x4);
ANDw_mask(x1, x1, 0, (u8&1)?7:15);
@ -1474,6 +1474,7 @@ uintptr_t dynarec64_660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int n
BFCw(xFlags, F_PF, 1);
}
}
ARM64_CHECK_PRECISION(); // to regen x87 if it has been used
} else {
SETFLAGS(X_ALL, SF_SET_DF);
if(gd>7) // no need to reflect cache as xmm0-xmm7 will be saved before the function call anyway

View File

@ -56,6 +56,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FADDS(v1, v1, v2);
} else {
FADDD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -77,6 +78,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FMULS(v1, v1, v2);
} else {
FMULD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -135,6 +137,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSUBS(v1, v1, v2);
} else {
FSUBD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -156,6 +159,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSUBS(v1, v2, v1);
} else {
FSUBD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -177,6 +181,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FDIVS(v1, v1, v2);
} else {
FDIVD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -198,6 +203,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FDIVS(v1, v2, v1);
} else {
FDIVD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -220,6 +226,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FADDD(v1, v1, s0);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -237,6 +244,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FMULD(v1, v1, s0);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -252,6 +260,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FCMPD(v1, s0);
X87_CHECK_PRECISION(v1);
}
FCOM(x1, x2, x3);
break;
@ -266,6 +275,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FCMPD(v1, s0);
X87_CHECK_PRECISION(v1);
}
FCOM(x1, x2, x3);
X87_POP_OR_FAIL(dyn, ninst, x3);
@ -283,6 +293,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FSUBD(v1, v1, s0);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -300,6 +311,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FSUBD(v1, s0, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -317,6 +329,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FDIVD(v1, v1, s0);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -334,6 +347,7 @@ uintptr_t dynarec64_D8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
} else {
FCVT_D_S(s0, s0);
FDIVD(v1, s0, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);

View File

@ -430,6 +430,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSQRTS(v1, v1);
} else {
FSQRTD(v1, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -509,7 +510,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
switch((nextop>>3)&7) {
case 0:
INST_NAME("FLD ST0, float[ED]");
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, BOX64ENV(dynarec_x87double)?NEON_CACHE_ST_D:NEON_CACHE_ST_F);
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, (BOX64ENV(dynarec_x87double)==1)?NEON_CACHE_ST_D:NEON_CACHE_ST_F);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, &unscaled, 0xfff<<2, 3, rex, NULL, 0, 0);
VLD32(v1, ed, fixedaddress);
if(!ST_IS_F(0)) {

View File

@ -150,6 +150,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FADDD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -164,6 +165,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FMULD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -201,6 +203,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FSUBD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -215,6 +218,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FSUBD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -229,6 +233,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FDIVD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -243,6 +248,7 @@ uintptr_t dynarec64_DA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x5, x4);
FDIVD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;

View File

@ -54,6 +54,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FADDS(v1, v1, v2);
} else {
FADDD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -75,6 +76,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FMULS(v1, v1, v2);
} else {
FMULD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -94,6 +96,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FCMPS(v1, v2);
} else {
FCMPD(v1, v2);
X87_CHECK_PRECISION(v1);
}
FCOM(x1, x2, x3);
break;
@ -112,6 +115,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FCMPS(v1, v2);
} else {
FCMPD(v1, v2);
X87_CHECK_PRECISION(v1);
}
FCOM(x1, x2, x3);
X87_POP_OR_FAIL(dyn, ninst, x3);
@ -133,6 +137,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSUBS(v1, v2, v1);
} else {
FSUBD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -154,6 +159,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSUBS(v1, v1, v2);
} else {
FSUBD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -175,6 +181,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FDIVS(v1, v2, v1);
} else {
FDIVD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -196,6 +203,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FDIVS(v1, v1, v2);
} else {
FDIVD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -214,6 +222,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FADDD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -226,6 +235,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FMULD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -257,6 +267,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FSUBD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -269,6 +280,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FSUBD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -281,6 +293,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FDIVD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;
@ -293,6 +306,7 @@ uintptr_t dynarec64_DC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
if(!BOX64ENV(dynarec_fastround))
u8 = x87_setround(dyn, ninst, x1, x2, x4);
FDIVD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
break;

View File

@ -54,6 +54,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FADDS(v1, v1, v2);
} else {
FADDD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -76,6 +77,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FMULS(v1, v1, v2);
} else {
FMULD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -130,6 +132,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSUBS(v1, v2, v1);
} else {
FSUBD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -152,6 +155,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FSUBS(v1, v1, v2);
} else {
FSUBD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -174,6 +178,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FDIVS(v1, v2, v1);
} else {
FDIVD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -196,6 +201,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
FDIVS(v1, v1, v2);
} else {
FDIVD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
}
if(!BOX64ENV(dynarec_fastround))
x87_restoreround(dyn, ninst, u8);
@ -216,6 +222,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
SXTL_32(v2, v2);
SCVTFDD(v2, v2);
FADDD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
break;
case 1:
INST_NAME("FIMUL ST0, word[ED]");
@ -227,6 +234,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
SXTL_32(v2, v2);
SCVTFDD(v2, v2);
FMULD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
break;
case 2:
INST_NAME("FICOM ST0, word[ED]");
@ -263,6 +271,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
SXTL_32(v2, v2);
SCVTFDD(v2, v2);
FSUBD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
break;
case 5:
INST_NAME("FISUBR ST0, word[ED]");
@ -274,6 +283,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
SXTL_32(v2, v2);
SCVTFDD(v2, v2);
FSUBD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
break;
case 6:
INST_NAME("FIDIV ST0, word[ED]");
@ -285,6 +295,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
SXTL_32(v2, v2);
SCVTFDD(v2, v2);
FDIVD(v1, v1, v2);
X87_CHECK_PRECISION(v1);
break;
case 7:
INST_NAME("FIDIVR ST0, word[ED]");
@ -296,6 +307,7 @@ uintptr_t dynarec64_DE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
SXTL_32(v2, v2);
SCVTFDD(v2, v2);
FDIVD(v1, v2, v1);
X87_CHECK_PRECISION(v1);
break;
}
return addr;

View File

@ -766,7 +766,7 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
dyn->insts[ninst].nat_flags_op = NAT_FLAG_OP_UNUSABLE;
#endif
if(savereg==0)
savereg = 7;
savereg = x87pc;
if(saveflags) {
STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
}
@ -804,6 +804,9 @@ void call_c(dynarec_arm_t* dyn, int ninst, void* fnc, int reg, int ret, int save
if(saveflags) {
LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
}
if(reg==x87pc && savereg!=x87pc && dyn->need_x87check) {
ARM64_CHECK_PRECISION(); // regen x87 mask
}
//SET_NODF();
}
@ -813,7 +816,7 @@ void call_i(dynarec_arm_t* dyn, int ninst, void* fnc)
#if STEP == 0
dyn->insts[ninst].nat_flags_op = NAT_FLAG_OP_UNUSABLE;
#endif
STPx_S7_preindex(x6, x7, xSP, -16);
STPx_S7_preindex(x6, x87pc, xSP, -16);
STPx_S7_preindex(x4, x5, xSP, -16);
STPx_S7_preindex(x2, x3, xSP, -16);
STPx_S7_preindex(xEmu, x1, xSP, -16); // ARM64 stack needs to be 16byte aligned
@ -823,10 +826,10 @@ void call_i(dynarec_arm_t* dyn, int ninst, void* fnc)
STPx_S7_offset(xRSI, xRDI, xEmu, offsetof(x64emu_t, regs[_SI]));
STPx_S7_offset(xR8, xR9, xEmu, offsetof(x64emu_t, regs[_R8]));
STRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
fpu_pushcache(dyn, ninst, x7, 0);
fpu_pushcache(dyn, ninst, x87pc, 0);
TABLE64(x7, (uintptr_t)fnc);
BLR(x7);
TABLE64(x87pc, (uintptr_t)fnc);
BLR(x87pc);
LDPx_S7_postindex(xEmu, x1, xSP, 16);
LDPx_S7_postindex(x2, x3, xSP, 16);
LDPx_S7_postindex(x4, x5, xSP, 16);
@ -838,8 +841,8 @@ void call_i(dynarec_arm_t* dyn, int ninst, void* fnc)
GO(R8, R9);
#undef GO
LDRx_U12(xFlags, xEmu, offsetof(x64emu_t, eflags));
fpu_popcache(dyn, ninst, x7, 0); // savereg will not be used
LDPx_S7_postindex(x6, x7, xSP, 16);
fpu_popcache(dyn, ninst, x87pc, 0); // savereg will not be used
LDPx_S7_postindex(x6, x87pc, xSP, 16);
//SET_NODF();
}
@ -859,12 +862,12 @@ void call_n(dynarec_arm_t* dyn, int ninst, void* fnc, int w)
if(abs(w)>1) {
MESSAGE(LOG_DUMP, "Getting %d XMM args\n", abs(w)-1);
for(int i=0; i<abs(w)-1; ++i) {
sse_get_reg(dyn, ninst, x7, i, w);
sse_get_reg(dyn, ninst, x3, i, w);
}
}
if(w<0) {
MESSAGE(LOG_DUMP, "Return in XMM0\n");
sse_get_reg_empty(dyn, ninst, x7, 0);
sse_get_reg_empty(dyn, ninst, x3, 0);
}
// prepare regs for native call
MOVx_REG(0, xRDI);

View File

@ -721,13 +721,13 @@
// CALL will use x87pc for the call address. Return value can be put in ret (unless ret is -1)
// R0 will not be pushed/popd if ret is -2
#define CALL(F, ret) call_c(dyn, ninst, F, x7, ret, 1, 0)
#define CALL(F, ret) call_c(dyn, ninst, F, x87pc, ret, 1, 0)
// CALL_ will use x87pc for the call address. Return value can be put in ret (unless ret is -1)
// R0 will not be pushed/popd if ret is -2
#define CALL_(F, ret, reg) call_c(dyn, ninst, F, x7, ret, 1, reg)
#define CALL_(F, ret, reg) call_c(dyn, ninst, F, x87pc, ret, 1, reg)
// CALL_S will use x87pc for the call address. Return value can be put in ret (unless ret is -1)
// R0 will not be pushed/popd if ret is -2. Flags are not save/restored
#define CALL_S(F, ret) call_c(dyn, ninst, F, x7, ret, 0, 0)
#define CALL_S(F, ret) call_c(dyn, ninst, F, x87pc, ret, 0, 0)
// CALL_I will use x87pc for the call address.
// All regs are saved, including scratch. This is used to call internal functions that should not change state
#define CALL_I(F) call_i(dyn, ninst, F)
@ -998,6 +998,21 @@
#define CALLRET_LOOP() NOP
#endif
#ifndef ARM64_CHECK_PRECISION
#define ARM64_CHECK_PRECISION() \
if(dyn->need_x87check) { \
LDRH_U12(x87pc, xEmu, offsetof(x64emu_t, cw)); \
UBFXw(x87pc, x87pc, 8, 2); \
}
#endif
#ifndef X87_CHECK_PRECISION
#define X87_CHECK_PRECISION(A) \
if(dyn->need_x87check) { \
CBNZw(x87pc, 4+8); \
FCVT_S_D(A, A); \
FCVT_D_S(A, A); \
}
#endif
#define STORE_REG(A) STRx_U12(x##A, xEmu, offsetof(x64emu_t, regs[_##A]))
#define STP_REGS(A, B) STPx_S7_offset(x##A, x##B, xEmu, offsetof(x64emu_t, regs[_##A]))
#define LDP_REGS(A, B) LDPx_S7_offset(x##A, x##B, xEmu, offsetof(x64emu_t, regs[_##A]))

View File

@ -72,3 +72,6 @@
// mark opcode as "unaligned" possible only if the current address is not marked as already unaligned
#define IF_UNALIGNED(A) if((dyn->insts[ninst].unaligned=is_addr_unaligned(A)))
#define IF_ALIGNED(A) if(!(dyn->insts[ninst].unaligned=is_addr_unaligned(A)))
#define ARM64_CHECK_PRECISION()
#define X87_CHECK_PRECISION(A)

View File

@ -22,3 +22,8 @@
dyn->insts[ninst].f_exit = dyn->f
#define INST_NAME(name)
#define ARM64_CHECK_PRECISION()
#define X87_CHECK_PRECISION(A) \
if(dyn->need_x87check) \
dyn->need_x87check=2

View File

@ -171,6 +171,7 @@ typedef struct dynarec_arm_s {
uint8_t always_test;
uint8_t abort; // abort the creation of the block
void* gdbjit_block;
uint32_t need_x87check; // non-zero if an x87 precision-control check is needed, 0 if not
} dynarec_arm_t;
void add_next(dynarec_arm_t *dyn, uintptr_t addr);

View File

@ -24,11 +24,11 @@ typedef struct dynablock_s {
uint8_t dirty; // if need to be tested as soon as it's created
uint8_t always_test:1;
uint8_t is32bits:1;
int callret_size; // size of the array
int isize;
size_t arch_size; // size of the arch-dependent infos
instsize_t* instsize;
void* arch; // arch dependant per inst info (can be NULL)
size_t arch_size; // size of the arch-dependent infos
int callret_size; // size of the array
callret_t* callrets; // array of callret return, with NOP / UDF depending if the block is clean or dirty
void* jmpnext; // a branch jmpnext code when block is marked
} dynablock_t;

View File

@ -35,6 +35,7 @@ extern uint32_t arm64_crc(void* p, uint32_t len);
#define ARCH_NOP 0b11010101000000110010000000011111
#define ARCH_UDF 0xcafe
#define ARCH_PRECISION() ARM64_CHECK_PRECISION()
#elif defined(LA64)
#define instruction_native_t instruction_la64_t

View File

@ -636,6 +636,11 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
CancelBlock64(0);
return NULL;
}
#ifdef ARCH_PRECISION
if(BOX64ENV(dynarec_x87double)==2) {
helper.need_x87check = 1;
}
#endif
// basic checks
if(!helper.size) {
dynarec_log(LOG_INFO, "Warning, null-sized dynarec block (%p)\n", (void*)addr);
@ -768,6 +773,12 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
CancelBlock64(0);
return NULL;
}
#ifdef ARCH_PRECISION
if(BOX64ENV(dynarec_x87double)==2) {
if(helper.need_x87check==1)
helper.need_x87check = 0;
}
#endif
// pass 2, instruction size
helper.callrets = static_callrets;
@ -796,7 +807,7 @@ void* FillBlock64(dynablock_t* block, uintptr_t addr, int alternate, int is32bit
size_t insts_rsize = (helper.insts_size+2)*sizeof(instsize_t);
insts_rsize = (insts_rsize+7)&~7; // round the size...
size_t arch_size = ARCH_SIZE(&helper);
size_t callret_size = helper.callret_size*4;
size_t callret_size = helper.callret_size*sizeof(callret_t);
// ok, now allocate mapped memory, with executable flag on
size_t sz = sizeof(void*) + native_size + helper.table64size*sizeof(uint64_t) + 4*sizeof(void*) + insts_rsize + arch_size + callret_size;
// dynablock_t* block (arm insts) table64 jmpnext code instsize arch callrets

View File

@ -83,6 +83,11 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
break;
}
#endif
#ifdef ARCH_PRECISION
if(!ninst && dyn->need_x87check) {
ARCH_PRECISION();
}
#endif
fpu_propagate_stack(dyn, ninst);
ip = addr;
if (reset_n!=-1) {

View File

@ -442,7 +442,7 @@ uintptr_t dynarec64_D9(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
switch ((nextop >> 3) & 7) {
case 0:
INST_NAME("FLD ST0, float[ED]");
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, BOX64ENV(dynarec_x87double) ? EXT_CACHE_ST_D : EXT_CACHE_ST_F);
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, (BOX64ENV(dynarec_x87double)==1) ? EXT_CACHE_ST_D : EXT_CACHE_ST_F);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, 0);
FLW(v1, ed, fixedaddress);
if (!ST_IS_F(0)) {

View File

@ -59,7 +59,7 @@ extern char* ftrace_name;
BOOLEAN(BOX64_DYNAREC_TRACE, dynarec_trace, 0) \
BOOLEAN(BOX64_DYNAREC_WAIT, dynarec_wait, 1) \
BOOLEAN(BOX64_DYNAREC_WEAKBARRIER, dynarec_weakbarrier, 1) \
BOOLEAN(BOX64_DYNAREC_X87DOUBLE, dynarec_x87double, 0) \
INTEGER(BOX64_DYNAREC_X87DOUBLE, dynarec_x87double, 0, 0, 2) \
STRING(BOX64_EMULATED_LIBS, emulated_libs) \
STRING(BOX64_ENV, env) \
STRING(BOX64_ENV1, env1) \

View File

@ -538,6 +538,13 @@ BOX64_DYNAREC_BIGBLOCK=3
BOX64_DYNAREC_CALLRET=1
BOX64_SHAEXT=0 #buggy openssl version in the game
[gta3.exe]
#BOX64_DYNAREC_SAFEFLAGS=2 #not needed
BOX64_DYNAREC_DIRTY=1
BOX64_DYNAREC_BIGBLOCK=3
BOX64_DYNAREC_CALLRET=1
BOX64_DYNAREC_X87DOUBLE=2
[Hades.exe]
BOX64_DYNAREC_BIGBLOCK=3
BOX64_DYNAREC_CALLRET=1