[RV64_DYNAREC] Added preliminary RVV infra and PXOR opcode for demonstration (#1632)

* [RV64_DYNAREC] Added preliminary RVV infra and PXOR opcode for demonstration
* keep sse_cache_s uint8_t as suggested
* use xor to do the swap
* revert
* better fallback
parent b5946f3752 · commit eb695d5553
.github/workflows/release.yml (vendored, 7 lines changed)
@@ -146,9 +146,10 @@ jobs:
           cd build
           if [[ ${{ matrix.platform }} == 'RISCV' ]]; then
             INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ ctest -j$(nproc) --output-on-failure
-            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ BOX64_DYNAREC=0 ctest -j$(nproc) --output-on-failure
-            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,zba=true,zbb=true,zbc=true,zbs=true ctest -j$(nproc) --output-on-failure
-            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,xtheadba=true,xtheadba=true,xtheadbb=true,xtheadbs=true,xtheadcondmov=true,xtheadmemidx=true,xtheadmempair=true,xtheadfmemidx=true,xtheadmac=true,xtheadfmv=true ctest -j$(nproc) --output-on-failure
+            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=false BOX64_DYNAREC=0 ctest -j$(nproc) --output-on-failure
+            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=false,zba=true,zbb=true,zbc=true,zbs=true ctest -j$(nproc) --output-on-failure
+            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=true,vlen=128,vext_spec=v1.0 ctest -j$(nproc) --output-on-failure
+            INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=false,xtheadba=true,xtheadba=true,xtheadbb=true,xtheadbs=true,xtheadcondmov=true,xtheadmemidx=true,xtheadmempair=true,xtheadfmemidx=true,xtheadmac=true,xtheadfmv=true ctest -j$(nproc) --output-on-failure
           elif [[ ${{ matrix.platform }} == 'LARCH64' ]]; then
             INTERPRETER=qemu-loongarch64-static QEMU_LD_PREFIX=/usr/loongarch64-linux-gnu/ BOX64_DYNAREC_LA64NOEXT=1 ctest -j$(nproc) --repeat until-pass:20 --output-on-failure
             INTERPRETER=qemu-loongarch64-static QEMU_LD_PREFIX=/usr/loongarch64-linux-gnu/ BOX64_DYNAREC_TEST=2 BOX64_DYNAREC_LA64NOEXT=1 ctest -j$(nproc) --repeat until-pass:20 --output-on-failure
CMakeLists.txt
@@ -821,6 +821,7 @@ if(RV64_DYNAREC)
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_df.c"
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_f0.c"
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_660f.c"
+    "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_660f_vector.c"
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66f20f.c"
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66f30f.c"
     "${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_6664.c"
src/dynarec/rv64/dynarec_rv64_0f.c
@@ -786,7 +786,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_reflect_reg(dyn, ninst, ed);
+                sse_reflect_reg(dyn, ninst, x6, ed);
                 ADDI(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 SMREAD();
@@ -796,9 +796,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
                 }
             }
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             ADDI(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
-            sse_reflect_reg(dyn, ninst, 0);
+            sse_reflect_reg(dyn, ninst, x6, 0);
             switch (u8) {
                 case 0xC8:
                     CALL(sha1nexte, -1);
@@ -878,7 +878,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             nextop = F8;
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_reflect_reg(dyn, ninst, ed);
+                sse_reflect_reg(dyn, ninst, x6, ed);
                 ADDI(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 SMREAD();
@@ -887,7 +887,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             }
             u8 = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             ADDI(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
             MOV32w(x3, u8);
             CALL(sha1rnds4, -1);
src/dynarec/rv64/dynarec_rv64_66.c
@@ -38,6 +38,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
     int64_t fixedaddress;
     int unscaled;
     int lock;
+    uintptr_t retaddr = 0;
     MAYUSE(u8);
     MAYUSE(u16);
     MAYUSE(u64);
@@ -125,7 +126,14 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             break;
         case 0x0F:
             switch(rep) {
-                case 0: addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
+                case 0: {
+                    if (rv64_vector) {
+                        retaddr = dynarec64_660F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog);
+                        addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
+                    } else
+                        addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
+                    break;
+                }
                 case 1: addr = dynarec64_66F20F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
                 case 2: addr = dynarec64_66F30F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
             }
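The fallback convention introduced here is the backbone of the whole RVV port: a *_vector translator returns 0 for any opcode it does not implement, and the caller then runs the scalar translator on the same address. A minimal sketch of that pattern in isolation (the wrapper name try_660F is hypothetical; the logic mirrors the diff above):

// Sketch of the "try vector, fall back to scalar" dispatch used above.
// dynarec64_660F_vector() returns 0 for any opcode it does not handle,
// so the scalar dynarec64_660F() stays the single source of truth.
// try_660F() is a hypothetical wrapper, not a function from the commit.
static uintptr_t try_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                          int ninst, rex_t rex, int* ok, int* need_epilog)
{
    if (rv64_vector) {
        uintptr_t retaddr = dynarec64_660F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog);
        if (retaddr) return retaddr;  // vector path handled the opcode
    }
    return dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); // scalar fallback
}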
src/dynarec/rv64/dynarec_rv64_660f.c
@@ -511,7 +511,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             nextop = F8;
             GETGX();
             GETEX(x2, 0);
-            sse_forget_reg(dyn, ninst, 0); // forget xmm[0]
+            sse_forget_reg(dyn, ninst, x6, 0); // forget xmm[0]
             for (int i = 0; i < 16; ++i) {
                 LB(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i);
                 BGE(x3, xZR, 12); // continue
@@ -920,11 +920,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             SETFLAGS(X_ALL, SF_SET_DF);
             nextop = F8;
             GETG;
-            sse_reflect_reg(dyn, ninst, gd);
+            sse_reflect_reg(dyn, ninst, x6, gd);
             ADDI(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_reflect_reg(dyn, ninst, ed);
+                sse_reflect_reg(dyn, ninst, x6, ed);
                 ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
             } else {
                 addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, NULL, 0, 1);
@@ -957,7 +957,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETGX();
             GETEX(x2, 0);
             SSE_LOOP_MV_Q(x3);
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aesimc, -1);
             break;
@@ -965,7 +965,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESENC Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aese, -1);
             GETGX();
@@ -976,7 +976,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESENCLAST Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aeselast, -1);
             GETGX();
@@ -987,7 +987,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESDEC Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aesd, -1);
             GETGX();
@@ -999,7 +999,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESDECLAST Gx, Ex"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd);
             CALL(native_aesdlast, -1);
             GETGX();
@@ -1333,11 +1333,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("PCLMULQDQ Gx, Ex, Ib");
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd); // gx
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_forget_reg(dyn, ninst, ed);
+                sse_forget_reg(dyn, ninst, x6, ed);
                 MOV32w(x2, ed);
                 MOV32w(x3, 0); // p = NULL
             } else {
@@ -1355,11 +1355,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI
             nextop = F8;
             GETG;
-            sse_forget_reg(dyn, ninst, gd);
+            sse_forget_reg(dyn, ninst, x6, gd);
             MOV32w(x1, gd); // gx
             if (MODREG) {
                 ed = (nextop & 7) + (rex.b << 3);
-                sse_forget_reg(dyn, ninst, ed);
+                sse_forget_reg(dyn, ninst, x6, ed);
                 MOV32w(x2, ed);
                 MOV32w(x3, 0); // p = NULL
             } else {
src/dynarec/rv64/dynarec_rv64_660f_vector.c (new file, 70 lines)
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include "debug.h"
+#include "box64context.h"
+#include "dynarec.h"
+#include "emu/x64emu_private.h"
+#include "emu/x64run_private.h"
+#include "x64run.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "bitutils.h"
+#include "rv64_printer.h"
+#include "dynarec_rv64_private.h"
+#include "dynarec_rv64_functions.h"
+#include "dynarec_rv64_helper.h"
+
+uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)need_epilog;
+    uint8_t opcode = F8;
+    uint8_t nextop, u8, s8;
+    int32_t i32;
+    uint8_t gd, ed;
+    uint8_t wback, wb1, wb2, gback;
+    uint8_t eb1, eb2;
+    int64_t j64;
+    uint64_t tmp64u, tmp64u2;
+    int v0, v1;
+    int q0, q1;
+    int d0, d1, d2;
+    int64_t fixedaddress, gdoffset;
+    int unscaled;
+    MAYUSE(d0);
+    MAYUSE(d1);
+    MAYUSE(q0);
+    MAYUSE(q1);
+    MAYUSE(eb1);
+    MAYUSE(eb2);
+    MAYUSE(j64);
+    switch (opcode) {
+        case 0xEF:
+            INST_NAME("PXOR Gx, Ex");
+            nextop = F8;
+            // FIXME: we should try to minimize vsetvl usage as it may hurt performance a lot.
+            vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
+
+            GETG;
+            if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
+                // special case
+                q0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
+                VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
+            } else {
+                q0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1);
+                GETEX_vector(q1, 0, 0);
+                VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
+            }
+            break;
+        default:
+            // fall back to the scalar version
+            return 0;
+    }
+    return addr;
+}
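For reference, the x86 semantics the RVV lowering above has to reproduce: PXOR is a lane-independent 128-bit XOR, so the element width is irrelevant and SEW8 works as well as any. A plain C model (illustrative helper, not part of the commit):

#include <stdint.h>

// C model of PXOR Gx, Ex: a full 128-bit XOR of two XMM registers.
// Because XOR is bitwise, the RVV code can run it at any SEW; the
// commit picks SEW8, i.e. 16 byte lanes for one 128-bit register.
static void pxor_model(uint8_t gx[16], const uint8_t ex[16])
{
    for (int i = 0; i < 16; ++i)
        gx[i] ^= ex[i];  // one VXOR_VV(q0, q0, q1) covers all 16 lanes
}

The MODREG special case above is the classic `pxor xmm, xmm` zeroing idiom: when Gx and Ex are the same register, VXOR_VV of the register with itself zeroes it, so the translator grabs an empty slot with sse_get_reg_empty_vector and never loads the old value from memory.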
src/dynarec/rv64/dynarec_rv64_functions.c
@@ -402,13 +402,16 @@ void extcacheUnwind(extcache_t* cache)
     }
     // add/change bad regs
     for(int i=0; i<16; ++i) {
-        if(cache->olds[i].changed) {
-            cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
-        } else if(cache->olds[i].purged) {
-            cache->extcache[i].n = i;
-            cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
+        if (cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_SD) {
+            if (cache->olds[i].changed) {
+                cache->extcache[i].t = cache->olds[i].single ? EXT_CACHE_SS : EXT_CACHE_SD;
+            } else if (cache->olds[i].purged) {
+                cache->extcache[i].n = i;
+                cache->extcache[i].t = cache->olds[i].single ? EXT_CACHE_SS : EXT_CACHE_SD;
+            }
         }
     }

    if(cache->stack_push) {
        // unpush
        for(int j=0; j<24; ++j) {
@@ -465,14 +468,23 @@ void extcacheUnwind(extcache_t* cache)
             break;
         case EXT_CACHE_SS:
             cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
+            cache->ssecache[cache->extcache[i].n].vector = 0;
             cache->ssecache[cache->extcache[i].n].single = 1;
             ++cache->fpu_reg;
             break;
         case EXT_CACHE_SD:
             cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
+            cache->ssecache[cache->extcache[i].n].vector = 0;
             cache->ssecache[cache->extcache[i].n].single = 0;
             ++cache->fpu_reg;
             break;
+        case EXT_CACHE_XMMR:
+        case EXT_CACHE_XMMW:
+            cache->ssecache[cache->extcache[i].n].reg = i;
+            cache->ssecache[cache->extcache[i].n].vector = 1;
+            cache->ssecache[cache->extcache[i].n].write = (cache->extcache[i].t == EXT_CACHE_XMMW) ? 1 : 0;
+            ++cache->fpu_reg;
+            break;
         case EXT_CACHE_ST_F:
         case EXT_CACHE_ST_D:
         case EXT_CACHE_ST_I64:
@@ -556,6 +568,8 @@ const char* getCacheName(int t, int n)
         case EXT_CACHE_SS: sprintf(buff, "SS%d", n); break;
         case EXT_CACHE_SD: sprintf(buff, "SD%d", n); break;
         case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break;
+        case EXT_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
+        case EXT_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
         case EXT_CACHE_NONE: buff[0]='\0'; break;
     }
     return buff;
@@ -570,6 +584,12 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
         "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fs8", "fs9", "fs10", "fs11",
         "ft8", "ft9", "ft10", "ft11"
     };
+    static const char* vnames[] = {
+        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+        "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+    };
     if(box64_dynarec_dump) {
         printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name);
         dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d",
@@ -607,6 +627,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
                 case EXT_CACHE_MM: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SS: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_XMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
+                case EXT_CACHE_XMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
                 case EXT_CACHE_NONE:
                 default: break;
@@ -689,4 +711,4 @@ void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
 int fpu_is_st_freed(dynarec_rv64_t* dyn, int ninst, int st)
 {
     return (dyn->e.tags&(0b11<<(st*2)))?1:0;
-}
+}
@@ -1570,10 +1570,15 @@ static void mmx_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
 // get ext register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
 {
-    if(dyn->e.ssecache[a].v!=-1) {
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 1) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg_vector(dyn, ninst, s1, a);
+            return sse_get_reg(dyn, ninst, s1, a, single);
+        }
         // forget / reload if change of size
         if(dyn->e.ssecache[a].single!=single) {
-            sse_forget_reg(dyn, ninst, a);
+            sse_forget_reg(dyn, ninst, s1, a);
             // update olds after the forget...
             dyn->e.olds[a].changed = 1;
             dyn->e.olds[a].purged = 0;
@@ -1585,41 +1590,52 @@ int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
     dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
     int ret = dyn->e.ssecache[a].reg;
     dyn->e.ssecache[a].single = single;
+    dyn->e.ssecache[a].vector = 0;
     if(dyn->e.ssecache[a].single)
         FLW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
         FLD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     return ret;
 }

 // get ext register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
 {
-    if(dyn->e.ssecache[a].v!=-1) {
-        if(dyn->e.ssecache[a].single!=single) {
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 1) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg_vector(dyn, ninst, s1, a);
+            return sse_get_reg_empty(dyn, ninst, s1, a, single);
+        }
+
+        if (dyn->e.ssecache[a].single != single) {
             if (single) {
-                // writing back the double
+                // writing back the double, to clear upper 32 bit.
                 FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
                 // need to wipe the half high 32bits of old Double because we now have a single
                 //SW(xZR, xEmu, offsetof(x64emu_t, xmm[a])+4);
             }
             dyn->e.olds[a].changed = 1;
             dyn->e.olds[a].purged = 0;
             dyn->e.olds[a].reg = EXTIDX(dyn->e.ssecache[a].reg);
             dyn->e.olds[a].single = 1-single;
             dyn->e.ssecache[a].single = single;
+            dyn->e.ssecache[a].vector = 0;
             dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = single?EXT_CACHE_SS:EXT_CACHE_SD;
         }
         return dyn->e.ssecache[a].reg;
     }
     dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
     dyn->e.ssecache[a].single = single;
+    dyn->e.ssecache[a].vector = 0;
     return dyn->e.ssecache[a].reg;
 }

@@ -1625,9 +1641,11 @@
 // forget ext register for a SSE reg, does nothing if the regs is not loaded
-void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
 {
-    if(dyn->e.ssecache[a].v==-1)
+    if (dyn->e.ssecache[a].v == -1)
         return;
+    if (dyn->e.ssecache[a].vector == 1)
+        return sse_forget_reg_vector(dyn, ninst, s1, a);
     if(dyn->e.ssecache[a].single)
         FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
@@ -1632,24 +1648,93 @@ void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
     dyn->e.ssecache[a].v = -1;
     return;
 }

+// get rvv register for a SSE reg, create the entry if needed
+int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwrite)
+{
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 0) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg(dyn, ninst, s1, a);
+            return sse_get_reg_vector(dyn, ninst, s1, a, forwrite);
+        }
+
+        if (forwrite) {
+            dyn->e.ssecache[a].write = 1; // update only if forwrite
+            dyn->e.ssecache[a].single = 0; // just to be clean
+            dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = EXT_CACHE_XMMW;
+        }
+        return dyn->e.ssecache[a].reg;
+    }
+    dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, forwrite ? EXT_CACHE_XMMW : EXT_CACHE_XMMR, a);
+    int ret = dyn->e.ssecache[a].reg;
+    dyn->e.ssecache[a].write = forwrite;
+    dyn->e.ssecache[a].vector = 1;
+    dyn->e.ssecache[a].single = 0; // just to be clean
+    ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
+    VLE8_V(ret, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    return ret;
+}
+
+// get rvv register for an SSE reg, but don't try to synch it if it needed to be created
+int sse_get_reg_empty_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
+{
+    if (dyn->e.ssecache[a].v != -1) {
+        if (dyn->e.ssecache[a].vector == 0) {
+            // it's in the fpu, forget it first...
+            sse_forget_reg(dyn, ninst, s1, a);
+            return sse_get_reg_empty_vector(dyn, ninst, s1, a);
+        }
+        dyn->e.ssecache[a].vector = 1;
+        dyn->e.ssecache[a].write = 1;
+        dyn->e.ssecache[a].single = 0; // just to be clean
+        dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = EXT_CACHE_XMMW;
+        return dyn->e.ssecache[a].reg;
+    }
+    dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, EXT_CACHE_XMMW, a);
+    dyn->e.ssecache[a].vector = 1;
+    dyn->e.ssecache[a].single = 0; // just to be clean
+    dyn->e.ssecache[a].write = 1; // it will be write...
+    return dyn->e.ssecache[a].reg;
+}
+
+// forget rvv register for a SSE reg, does nothing if the regs is not loaded
+void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
+{
+    if (dyn->e.ssecache[a].v == -1)
+        return;
+    if (dyn->e.ssecache[a].vector == 0)
+        return sse_forget_reg(dyn, ninst, s1, a);
+    if (dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t == EXT_CACHE_XMMW) {
+        ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
+        VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    }
+    fpu_free_reg(dyn, dyn->e.ssecache[a].reg);
+    dyn->e.ssecache[a].v = -1;
+    return;
+}
+
 // purge the SSE cache for XMM0..XMM7 (to use before function native call)
 void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1)
 {
     int old = -1;
-    for (int i=0; i<8; ++i)
-        if(dyn->e.ssecache[i].v!=-1) {
-            if (old==-1) {
+    for (int i = 0; i < 8; ++i)
+        if (dyn->e.ssecache[i].v != -1) {
+            if (old == -1) {
                 MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n");
                 ++old;
             }
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
             dyn->e.ssecache[i].v = -1;
         }
-    if(old!=-1) {
+    if (old != -1) {
         MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n");
     }
 }
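The structure behind these helpers: an XMM value is cached in at most one domain at a time, either a scalar FPU register (SS/SD) or an RVV register (XMMR/XMMW), and each getter first evicts the other domain's copy through the matching forget call, which writes the value back to emu->xmm[] before the reload. A toy model of that hand-off (hypothetical names, for illustration only):

#include <stdio.h>

// Toy model of the eviction protocol used by sse_get_reg{,_empty}_vector
// and sse_forget_reg{,_vector}: a register never lives in both domains.
typedef enum { NOT_CACHED, IN_SCALAR, IN_VECTOR } xmm_home_t;
static xmm_home_t home[16];

static void get_vector(int a) {
    if (home[a] == IN_SCALAR)   // "it's in the fpu, forget it first..."
        home[a] = NOT_CACHED;   // forget = write back to emu->xmm[a] (FSW/FSD)
    home[a] = IN_VECTOR;        // then reload as a vector (VLE8_V)
}

static void get_scalar(int a) {
    if (home[a] == IN_VECTOR)   // forget the RVV copy first (VSE8_V if dirty)
        home[a] = NOT_CACHED;
    home[a] = IN_SCALAR;        // then reload as a float/double (FLW/FLD)
}

int main(void) {
    get_vector(3);              // xmm3 cached in an RVV register
    get_scalar(3);              // forces a store, then a scalar reload
    printf("xmm3 home: %d\n", home[3]);
    return 0;
}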
@@ -1664,17 +1749,25 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
                 MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
                 ++old;
             }
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             if(!next) {
-                fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
-                dyn->e.olds[i].changed = 0;
-                dyn->e.olds[i].purged = 1;
-                dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
-                dyn->e.olds[i].single = dyn->e.ssecache[i].single;
-                dyn->e.ssecache[i].v = -1;
+                if (dyn->e.ssecache[i].vector) {
+                    fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+                    dyn->e.ssecache[i].v = -1;
+                } else {
+                    fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
+                    dyn->e.olds[i].changed = 0;
+                    dyn->e.olds[i].purged = 1;
+                    dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
+                    dyn->e.olds[i].single = dyn->e.ssecache[i].single;
+                    dyn->e.ssecache[i].v = -1;
+                }
             }
         }
     if(old!=-1) {
@@ -1684,20 +1777,26 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)

 static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
 {
-    for (int i=0; i<16; ++i)
-        if(dyn->e.ssecache[i].v!=-1) {
-            if(dyn->e.ssecache[i].single)
+    for (int i = 0; i < 16; ++i)
+        if (dyn->e.ssecache[i].v != -1) {
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
         }
 }

-void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int a)
+void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
 {
     if (dyn->e.ssecache[a].v == -1)
         return;
-    if (dyn->e.ssecache[a].single)
+    if (dyn->e.ssecache[a].vector) {
+        ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
+        VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+    } else if (dyn->e.ssecache[a].single)
         FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
     else
         FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
@@ -1717,7 +1816,10 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
     MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
     for (int i=start; i<8; ++i)
         if(dyn->e.ssecache[i].v!=-1) {
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
@@ -1760,7 +1862,10 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
     MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
     for (int i=start; i<8; ++i)
         if(dyn->e.ssecache[i].v!=-1) {
-            if(dyn->e.ssecache[i].single)
+            if (dyn->e.ssecache[i].vector) {
+                ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
+                VLE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            } else if (dyn->e.ssecache[i].single)
                 FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
             else
                 FLD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
@@ -1829,6 +1934,14 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_
                     if (t == EXT_CACHE_ST_D)
                         return i;
                     break;
+                case EXT_CACHE_XMMR:
+                    if (t == EXT_CACHE_XMMW)
+                        return i;
+                    break;
+                case EXT_CACHE_XMMW:
+                    if (t == EXT_CACHE_XMMR)
+                        return i;
+                    break;
             }
         }
     }
@@ -1837,16 +1950,33 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_

 static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *cache)
 {
-    if (i==j)
+    if (i == j) return;
+
+    if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW) {
+        if (!cache->extcache[i].v) {
+            // a mov is enough, no need to swap
+            MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j);
+            VOR_VV(i, j, j, VECTOR_UNMASKED);
+            cache->extcache[i].v = cache->extcache[j].v;
+            cache->extcache[j].v = 0;
+            return;
+        }
+        // SWAP
+        ext_cache_t tmp;
+        MESSAGE(LOG_DUMP, "\t - Swapping %d <-> %d\n", i, j);
+        VXOR_VV(i, i, j, VECTOR_UNMASKED);
+        VXOR_VV(j, i, j, VECTOR_UNMASKED);
+        VXOR_VV(i, i, j, VECTOR_UNMASKED);
+        tmp.v = cache->extcache[i].v;
+        cache->extcache[i].v = cache->extcache[j].v;
+        cache->extcache[j].v = tmp.v;
+        return;
+    }
+
     int reg_i = EXTREG(i);
     int reg_j = EXTREG(j);
-    int i_single = 0;
-    if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
-        i_single =1;
-    int j_single = 0;
-    if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F)
-        j_single =1;
+    int i_single = cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_ST_F;
+    int j_single = cache->extcache[j].t == EXT_CACHE_SS || cache->extcache[j].t == EXT_CACHE_ST_F;

     if(!cache->extcache[i].v) {
         // a mov is enough, no need to swap
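The three chained VXOR_VV instructions above are the classic XOR swap, exchanging two vector registers without a scratch register; the early `if (i == j) return;` also matters, because an XOR swap of a register with itself zeroes it. A scalar demonstration of the identity (illustrative only, not from the commit):

#include <assert.h>
#include <stdint.h>

// The identity behind the three VXOR_VV instructions in swapCache():
//   i' = i ^ j
//   j' = i' ^ j = i     (j now holds the old i)
//   i''= i' ^ j' = j    (i now holds the old j)
static void xor_swap(uint64_t* i, uint64_t* j)
{
    *i ^= *j;
    *j ^= *i;
    *i ^= *j;
}

int main(void)
{
    uint64_t a = 0x1234, b = 0x5678;
    xor_swap(&a, &b);
    assert(a == 0x5678 && b == 0x1234);
    return 0;
}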
@@ -1887,17 +2017,22 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *
 static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t* cache, int i, int t, int n)
 {
     int reg = EXTREG(i);
-    if(cache->extcache[i].v) {
-        int single = 0;
-        if(t==EXT_CACHE_SS || t==EXT_CACHE_ST_F)
-            single = 1;
-        if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
-            single = 1;
-        int j = i+1;
-        while(cache->extcache[j].v)
-            ++j;
+    if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW)) {
+        int j = i + 1;
+        while (cache->extcache[j].v) ++j;
         MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i);
-        if(single) {
+        VOR_VV(j, i, i, VECTOR_UNMASKED);
+        cache->extcache[j].v = cache->extcache[i].v;
+    } else if (cache->extcache[i].v) {
+        int single = 0;
+        if (t == EXT_CACHE_SS || t == EXT_CACHE_ST_F)
+            single = 1;
+        if (cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_ST_F)
+            single = 1;
+        int j = i + 1;
+        while (cache->extcache[j].v) ++j;
+        MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i);
+        if (single) {
             FMVS(EXTREG(j), reg);
         } else {
             FMVD(EXTREG(j), reg);
@@ -1905,6 +2040,12 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
         cache->extcache[j].v = cache->extcache[i].v;
     }
     switch(t) {
+        case EXT_CACHE_XMMR:
+        case EXT_CACHE_XMMW:
+            MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
+            ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
+            VLE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
             FLW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@@ -1956,6 +2097,14 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
 {
     int reg = EXTREG(i);
     switch(t) {
+        case EXT_CACHE_XMMR:
+            MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
+            break;
+        case EXT_CACHE_XMMW:
+            MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
+            ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
+            VSE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
+            break;
         case EXT_CACHE_SS:
             MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
             FSW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@@ -2045,43 +2194,47 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
     int s2_val = 0;
     // unload every uneeded cache
     // check SSE first, than MMX, in order, for optimisation issue
-    for(int i=0; i<16; ++i) {
-        int j=findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
-        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2)==-1)
+    if (rv64_vector) vector_vsetvl_emul1(dyn, ninst, s1, VECTOR_SEW8);
+    for (int i = 0; i < 16; ++i) {
+        int j = findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2) == -1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
-        j=findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
-        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2)==-1)
+        j = findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2) == -1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
+        j = findCacheSlot(dyn, ninst, EXT_CACHE_XMMW, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_XMMW, i, &cache_i2) == -1)
+            unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
     }
-    for(int i=0; i<8; ++i) {
-        int j=findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
-        if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2)==-1)
+    for (int i = 0; i < 8; ++i) {
+        int j = findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
+        if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2) == -1)
             unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
     }
-    for(int i=0; i<24; ++i) {
+    for (int i = 0; i < 24; ++i) {
         if(cache.extcache[i].v)
-            if(findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2)==-1)
+            if (findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2) == -1)
                 unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.extcache[i].t, cache.extcache[i].n);
     }
     // and now load/swap the missing one
-    for(int i=0; i<24; ++i) {
-        if(cache_i2.extcache[i].v) {
-            if(cache_i2.extcache[i].v != cache.extcache[i].v) {
+    for (int i = 0; i < 24; ++i) {
+        if (cache_i2.extcache[i].v) {
+            if (cache_i2.extcache[i].v != cache.extcache[i].v) {
                 int j;
-                if((j=findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache))==-1)
+                if ((j = findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache)) == -1)
                     loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.extcache[i].t, cache_i2.extcache[i].n);
                 else {
                     // it's here, lets swap if needed
-                    if(j!=i)
+                    if (j != i)
                         swapCache(dyn, ninst, i, j, &cache);
                 }
             }
-            if(cache.extcache[i].t != cache_i2.extcache[i].t) {
-                if(cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
+            if (cache.extcache[i].t != cache_i2.extcache[i].t) {
+                if (cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
                     MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
                     FCVTSD(EXTREG(i), EXTREG(i));
                     cache.extcache[i].t = EXT_CACHE_ST_F;
-                } else if(cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
+                } else if (cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
                     MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
                     FCVTDS(EXTREG(i), EXTREG(i));
                     cache.extcache[i].t = EXT_CACHE_ST_D;
@@ -2331,3 +2484,18 @@ void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
     dyn->e.stack_push = 0;
     dyn->e.swapped = 0;
 }
+
+// Use the vector extension like SIMD for now; this function sets the specified element width,
+// other configs are set automatically.
+void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew)
+{
+    /* mu: mask undisturbed
+     * tu: tail undisturbed
+     * sew: selected element width
+     * lmul: vector register group multiplier
+     *
+     *                    mu          tu          sew      lmul=1 */
+    uint32_t vtypei = (0b0 << 7) | (0b0 << 6) | (sew << 3) | 0b000;
+    ADDI(s1, xZR, 16 >> sew);
+    VSETVLI(xZR, s1, vtypei);
+}
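To make the encoding concrete: with LMUL=1 and the tail/mask policy bits left at 0, vtypei carries only the SEW field, and the requested AVL of 16 >> sew is exactly the element count that fills one 128-bit XMM register. A standalone check of that arithmetic (plain C, reusing the bit layout from the function above):

#include <stdint.h>
#include <stdio.h>

// Reproduces the vtypei/AVL arithmetic of vector_vsetvl_emul1() for each SEW.
int main(void)
{
    const char* name[] = { "SEW8", "SEW16", "SEW32", "SEW64" };
    for (uint32_t sew = 0; sew < 4; ++sew) {
        uint32_t vtypei = (0u << 7) | (0u << 6) | (sew << 3) | 0u; // vma=0, vta=0, lmul=1
        uint32_t avl = 16u >> sew;  // elements per 128-bit XMM: 16, 8, 4, 2
        printf("%-5s vtypei=0x%02x avl=%u (%u bits)\n",
               name[sew], vtypei, avl, avl * (8u << sew));
    }
    return 0; // every row prints 128 bits, one full XMM register
}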
src/dynarec/rv64/dynarec_rv64_helper.h
@@ -473,10 +473,10 @@
         FLD(a, ed, fixedaddress); \
     }

-// Will get pointer to GX in general register a, will purge SS or SD if loaded. can use gback as load address
+// Will get pointer to GX in general register a, will purge SS or SD if loaded. May use x3. can use gback as load address
 #define GETGX() \
     gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \
-    sse_forget_reg(dyn, ninst, gd); \
+    sse_forget_reg(dyn, ninst, x3, gd); \
     gback = xEmu; \
     gdoffset = offsetof(x64emu_t, xmm[gd])
@@ -484,7 +484,7 @@
 #define GETEX(a, D) \
     if (MODREG) { \
         ed = (nextop & 7) + (rex.b << 3); \
-        sse_forget_reg(dyn, ninst, ed); \
+        sse_forget_reg(dyn, ninst, x3, ed); \
         fixedaddress = offsetof(x64emu_t, xmm[ed]); \
         wback = xEmu; \
     } else { \
@@ -494,6 +494,18 @@
         fixedaddress = 0; /* TODO: optimize this! */ \
     }

+// Get EX as a quad, (x1 is used)
+#define GETEX_vector(a, w, D) \
+    if (MODREG) { \
+        a = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w); \
+    } else { \
+        SMREAD(); \
+        addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, D); \
+        a = fpu_get_scratch(dyn); \
+        ADDI(x2, ed, fixedaddress); \
+        VLE8_V(a, x2, VECTOR_UNMASKED, VECTOR_NFIELD1); \
+    }
+
 #define GETGM() \
     gd = ((nextop & 0x38) >> 3); \
     mmx_forget_reg(dyn, ninst, gd); \
@@ -1093,6 +1105,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define dynarec64_F20F STEPNAME(dynarec64_F20F)
 #define dynarec64_F30F STEPNAME(dynarec64_F30F)

+#define dynarec64_660F_vector STEPNAME(dynarec64_660F_vector)
+
 #define geted STEPNAME(geted)
 #define geted32 STEPNAME(geted32)
 #define geted16 STEPNAME(geted16)
@@ -1223,6 +1237,10 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define sse_purge07cache STEPNAME(sse_purge07cache)
 #define sse_reflect_reg STEPNAME(sse_reflect_reg)

+#define sse_get_reg_empty_vector STEPNAME(sse_get_reg_empty_vector)
+#define sse_get_reg_vector STEPNAME(sse_get_reg_vector)
+#define sse_forget_reg_vector STEPNAME(sse_forget_reg_vector)
+
 #define fpu_pushcache STEPNAME(fpu_pushcache)
 #define fpu_popcache STEPNAME(fpu_popcache)
 #define fpu_reset_cache STEPNAME(fpu_reset_cache)
@@ -1238,6 +1256,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
 #define rv64_move64 STEPNAME(rv64_move64)
 #define rv64_move32 STEPNAME(rv64_move32)

+#define vector_vsetvl_emul1 STEPNAME(vector_vsetvl_emul1)
+
 /* setup r2 to address pointed by */
 uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
@@ -1392,6 +1412,8 @@ void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2
 void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val);
 void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zeroup);

+void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew);
+
 #if STEP < 2
 #define CHECK_CACHE() 0
 #else
@@ -1435,14 +1457,20 @@ void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
 // SSE/SSE2 helpers
 // get float register for a SSE reg, create the entry if needed
 int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// get rvv register for a SSE reg, create the entry if needed
+int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwrite);
 // get float register for a SSE reg, but don't try to synch it if it needed to be created
 int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
+// get rvv register for an SSE reg, but don't try to synch it if it needed to be created
+int sse_get_reg_empty_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a);
 // forget float register for a SSE reg, create the entry if needed
-void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
+void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);
+// forget rvv register for a SSE reg, does nothing if the regs is not loaded
+void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a);
 // purge the XMM0..XMM7 cache (before function call)
 void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
 // Push current value to the cache
-void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int a);
+void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);

 // common coproc helpers
 // reset the cache with n
@@ -1489,6 +1517,8 @@ uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
 uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
 uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);

+uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
+
 #if STEP < 2
 #define PASS2(A)
 #else
src/dynarec/rv64/dynarec_rv64_private.h
@@ -18,6 +18,9 @@ typedef struct instsize_s instsize_t;
 #define EXT_CACHE_SS 5
 #define EXT_CACHE_SD 6
 #define EXT_CACHE_SCR 7
+#define EXT_CACHE_XMMW 8
+#define EXT_CACHE_XMMR 9
+
 typedef union ext_cache_s {
     int8_t v;
     struct {
@@ -25,13 +28,18 @@ typedef union ext_cache_s {
         uint8_t n:4; // reg number
     };
 } ext_cache_t;

 typedef union sse_cache_s {
-    int8_t v;
+    int16_t v;
     struct {
-        uint8_t reg:7;
-        uint8_t single:1;
+        uint16_t reg : 7;
+        uint16_t vector : 1;
+        uint16_t single : 1;
+        uint16_t write : 1;
+        uint16_t unused : 7;
     };
 } sse_cache_t;

 typedef union sse_old_s {
     int8_t v;
     struct {
@@ -41,6 +49,7 @@ typedef union sse_old_s {
         uint8_t single:1;
     };
 } sse_old_t;

 typedef struct extcache_s {
     // ext cache
     ext_cache_t extcache[24];
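Why v widened: the old layout packed reg:7 plus single:1 into one byte, with v as a signed overlay so v = -1 marks the slot free. The new vector and write flags push the payload past 8 bits, so v grows to int16_t despite the "keep sse_cache_s uint8_t" bullet in the commit description. A standalone sanity check mirroring the union above (the asserts are mine, not from the commit):

#include <assert.h>
#include <stdint.h>

typedef union sse_cache_s {
    int16_t v;               // v == -1 (all bits set) still means "not cached"
    struct {
        uint16_t reg : 7;    // ext register index
        uint16_t vector : 1; // cached in an RVV register rather than scalar FPU
        uint16_t single : 1; // scalar copy is a float (SS) rather than double (SD)
        uint16_t write : 1;  // RVV copy is dirty and must be stored back
        uint16_t unused : 7;
    };
} sse_cache_t;

int main(void)
{
    static_assert(sizeof(sse_cache_t) == 2, "bitfields must fit in 16 bits");
    sse_cache_t c = { .v = -1 };
    assert(c.reg == 0x7F && c.vector && c.single && c.write); // all bits set when empty
    return 0;
}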
src/dynarec/rv64/rv64_emitter.h
@@ -1206,6 +1206,23 @@ f28–31 ft8–11 FP temporaries Caller

+// Vector extension emitter
+
+#define VECTOR_SEW8 0b000
+#define VECTOR_SEW16 0b001
+#define VECTOR_SEW32 0b010
+#define VECTOR_SEW64 0b011
+
+#define VECTOR_MASKED 0
+#define VECTOR_UNMASKED 1
+
+#define VECTOR_NFIELD1 0b000
+#define VECTOR_NFIELD2 0b001
+#define VECTOR_NFIELD3 0b010
+#define VECTOR_NFIELD4 0b011
+#define VECTOR_NFIELD5 0b100
+#define VECTOR_NFIELD6 0b101
+#define VECTOR_NFIELD7 0b110
+#define VECTOR_NFIELD8 0b111
+
+// configuration setting
+// https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
+#define VSETIVLI(rd, zimm, zimm10) EMIT(I_type(0b110000000000 | (zimm10), zimm, 0b111, rd, 0b1010111)) // 11...............111.....1010111
src/rv64detect.c
@@ -69,9 +69,10 @@ void RV64_Detect_Function()
     rv64_zbs = Check(my_block);

     // Test Vector v1.0 with CSRR zero, vcsr
     block = (uint32_t*)my_block;
     CSRRS(xZR, xZR, 0x00f);
     BR(xRA);
-    rv64_vector = Check(my_block);
+    rv64_vector = Check(my_block); // TODO: also check vlen >= 128

     // THead vendor extensions
     if (!rv64_zba) {
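One way the TODO above could be addressed, sketched under the assumption that the V extension has already been confirmed (reading a vector CSR traps otherwise): the read-only vlenb CSR (0xC22) holds VLEN in bytes, so VLEN >= 128 means vlenb >= 16. This is a hypothetical follow-up, not code from the commit:

#include <stdint.h>

// Hypothetical follow-up for the TODO: vlenb (CSR 0xC22) = VLEN / 8.
// Must only run after the rv64_vector probe above has succeeded, since
// accessing any vector CSR raises an illegal-instruction trap without V.
static int rv64_vlen_at_least_128(void)
{
    uint64_t vlenb;
    __asm__ volatile("csrr %0, 0xc22" : "=r"(vlenb));
    return vlenb >= 16; // 16 bytes == 128 bits
}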