[RV64_DYNAREC] Added preliminary RVV infra and PXOR opcode for demonstration (#1632)

* [RV64_DYNAREC] Added preliminary RVV infra and PXOR opcode for demonstration

* keep sse_cache_s uint8_t as suggested

* use xor to do the swap

* revert

* better fallback
Yang Liu 2024-07-04 18:39:05 +08:00 committed by GitHub
parent b5946f3752
commit eb695d5553
12 changed files with 426 additions and 99 deletions


@ -146,9 +146,10 @@ jobs:
cd build
if [[ ${{ matrix.platform }} == 'RISCV' ]]; then
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ BOX64_DYNAREC=0 ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,zba=true,zbb=true,zbc=true,zbs=true ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,xtheadba=true,xtheadba=true,xtheadbb=true,xtheadbs=true,xtheadcondmov=true,xtheadmemidx=true,xtheadmempair=true,xtheadfmemidx=true,xtheadmac=true,xtheadfmv=true ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=false BOX64_DYNAREC=0 ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=false,zba=true,zbb=true,zbc=true,zbs=true ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=true,vlen=128,vext_spec=v1.0 ctest -j$(nproc) --output-on-failure
INTERPRETER=qemu-riscv64-static QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ QEMU_CPU=rv64,v=false,xtheadba=true,xtheadba=true,xtheadbb=true,xtheadbs=true,xtheadcondmov=true,xtheadmemidx=true,xtheadmempair=true,xtheadfmemidx=true,xtheadmac=true,xtheadfmv=true ctest -j$(nproc) --output-on-failure
elif [[ ${{ matrix.platform }} == 'LARCH64' ]]; then
INTERPRETER=qemu-loongarch64-static QEMU_LD_PREFIX=/usr/loongarch64-linux-gnu/ BOX64_DYNAREC_LA64NOEXT=1 ctest -j$(nproc) --repeat until-pass:20 --output-on-failure
INTERPRETER=qemu-loongarch64-static QEMU_LD_PREFIX=/usr/loongarch64-linux-gnu/ BOX64_DYNAREC_TEST=2 BOX64_DYNAREC_LA64NOEXT=1 ctest -j$(nproc) --repeat until-pass:20 --output-on-failure


@ -821,6 +821,7 @@ if(RV64_DYNAREC)
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_df.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_f0.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_660f.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_660f_vector.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66f20f.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66f30f.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_6664.c"


@ -786,7 +786,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
nextop = F8;
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
sse_reflect_reg(dyn, ninst, ed);
sse_reflect_reg(dyn, ninst, x6, ed);
ADDI(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
} else {
SMREAD();
@ -796,9 +796,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
}
}
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
ADDI(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
sse_reflect_reg(dyn, ninst, 0);
sse_reflect_reg(dyn, ninst, x6, 0);
switch (u8) {
case 0xC8:
CALL(sha1nexte, -1);
@ -878,7 +878,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
nextop = F8;
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
sse_reflect_reg(dyn, ninst, ed);
sse_reflect_reg(dyn, ninst, x6, ed);
ADDI(x2, xEmu, offsetof(x64emu_t, xmm[ed]));
} else {
SMREAD();
@ -887,7 +887,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
}
u8 = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
ADDI(x1, xEmu, offsetof(x64emu_t, xmm[gd]));
MOV32w(x3, u8);
CALL(sha1rnds4, -1);


@ -38,6 +38,7 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
int64_t fixedaddress;
int unscaled;
int lock;
uintptr_t retaddr = 0;
MAYUSE(u8);
MAYUSE(u16);
MAYUSE(u64);
@ -125,7 +126,14 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
break;
case 0x0F:
switch(rep) {
case 0: addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
case 0: {
if (rv64_vector) {
retaddr = dynarec64_660F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog);
addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
} else
addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
break;
}
case 1: addr = dynarec64_66F20F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
case 2: addr = dynarec64_66F30F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;
}
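The dispatch above tries the RVV decoder first and falls back to the scalar 660F decoder whenever the vector one returns 0 for an opcode it does not handle. A minimal sketch of that pattern, with hypothetical names (not part of the diff):

    #include <stdint.h>

    typedef uintptr_t (*decoder_t)(uintptr_t addr);

    // The vector decoder returns 0 for any opcode it does not implement,
    // telling the caller to retry with the scalar decoder.
    static uintptr_t decode_with_fallback(decoder_t vector, decoder_t scalar,
                                          uintptr_t addr, int have_rvv)
    {
        if (have_rvv) {
            uintptr_t ret = vector(addr);
            if (ret)
                return ret; // the vector path handled this opcode
        }
        return scalar(addr); // scalar fallback
    }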


@ -511,7 +511,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
nextop = F8;
GETGX();
GETEX(x2, 0);
sse_forget_reg(dyn, ninst, 0); // forget xmm[0]
sse_forget_reg(dyn, ninst, x6, 0); // forget xmm[0]
for (int i = 0; i < 16; ++i) {
LB(x3, xEmu, offsetof(x64emu_t, xmm[0]) + i);
BGE(x3, xZR, 12); // continue
@ -920,11 +920,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
SETFLAGS(X_ALL, SF_SET_DF);
nextop = F8;
GETG;
sse_reflect_reg(dyn, ninst, gd);
sse_reflect_reg(dyn, ninst, x6, gd);
ADDI(x3, xEmu, offsetof(x64emu_t, xmm[gd]));
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
sse_reflect_reg(dyn, ninst, ed);
sse_reflect_reg(dyn, ninst, x6, ed);
ADDI(x1, xEmu, offsetof(x64emu_t, xmm[ed]));
} else {
addr = geted(dyn, addr, ninst, nextop, &wback, x1, x2, &fixedaddress, rex, NULL, 0, 1);
@ -957,7 +957,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
GETGX();
GETEX(x2, 0);
SSE_LOOP_MV_Q(x3);
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd);
CALL(native_aesimc, -1);
break;
@ -965,7 +965,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
INST_NAME("AESENC Gx, Ex"); // AES-NI
nextop = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd);
CALL(native_aese, -1);
GETGX();
@ -976,7 +976,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
INST_NAME("AESENCLAST Gx, Ex"); // AES-NI
nextop = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd);
CALL(native_aeselast, -1);
GETGX();
@ -987,7 +987,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
INST_NAME("AESDEC Gx, Ex"); // AES-NI
nextop = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd);
CALL(native_aesd, -1);
GETGX();
@ -999,7 +999,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
INST_NAME("AESDECLAST Gx, Ex"); // AES-NI
nextop = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd);
CALL(native_aesdlast, -1);
GETGX();
@ -1333,11 +1333,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
INST_NAME("PCLMULQDQ Gx, Ex, Ib");
nextop = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd); // gx
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
sse_forget_reg(dyn, ninst, ed);
sse_forget_reg(dyn, ninst, x6, ed);
MOV32w(x2, ed);
MOV32w(x3, 0); // p = NULL
} else {
@ -1355,11 +1355,11 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
INST_NAME("AESKEYGENASSIST Gx, Ex, Ib"); // AES-NI
nextop = F8;
GETG;
sse_forget_reg(dyn, ninst, gd);
sse_forget_reg(dyn, ninst, x6, gd);
MOV32w(x1, gd); // gx
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
sse_forget_reg(dyn, ninst, ed);
sse_forget_reg(dyn, ninst, x6, ed);
MOV32w(x2, ed);
MOV32w(x3, 0); // p = NULL
} else {


@ -0,0 +1,70 @@
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <errno.h>
#include "debug.h"
#include "box64context.h"
#include "dynarec.h"
#include "emu/x64emu_private.h"
#include "emu/x64run_private.h"
#include "x64run.h"
#include "x64emu.h"
#include "box64stack.h"
#include "callback.h"
#include "emu/x64run_private.h"
#include "x64trace.h"
#include "dynarec_native.h"
#include "bitutils.h"
#include "rv64_printer.h"
#include "dynarec_rv64_private.h"
#include "dynarec_rv64_functions.h"
#include "dynarec_rv64_helper.h"
uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
{
(void)ip;
(void)need_epilog;
uint8_t opcode = F8;
uint8_t nextop, u8, s8;
int32_t i32;
uint8_t gd, ed;
uint8_t wback, wb1, wb2, gback;
uint8_t eb1, eb2;
int64_t j64;
uint64_t tmp64u, tmp64u2;
int v0, v1;
int q0, q1;
int d0, d1, d2;
int64_t fixedaddress, gdoffset;
int unscaled;
MAYUSE(d0);
MAYUSE(d1);
MAYUSE(q0);
MAYUSE(q1);
MAYUSE(eb1);
MAYUSE(eb2);
MAYUSE(j64);
switch (opcode) {
case 0xEF:
INST_NAME("PXOR Gx, Ex");
nextop = F8;
// FIXME: we should try to minimize vsetvl usage as it may hurt performance a lot.
vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW8);
GETG;
if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
// special case
q0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
} else {
q0 = sse_get_reg_vector(dyn, ninst, x1, gd, 1);
GETEX_vector(q1, 0, 0);
VXOR_VV(q0, q0, q1, VECTOR_UNMASKED);
}
break;
default:
// fallback to the scalar version
return 0;
}
return addr;
}
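For reference, a worked decode of the self-xor zeroing idiom that the special case above targets (illustrative values, not part of the diff): for 66 0F EF DB (pxor xmm3, xmm3) the ModRM byte names the same register for both operands, so one vxor.vv of the register with itself is enough.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint8_t nextop = 0xDB;          // ModRM byte of "pxor xmm3, xmm3"
        assert((nextop >> 6) == 3);     // mod == 3, i.e. MODREG
        uint8_t gd = (nextop >> 3) & 7; // reg field -> xmm3 (rex.r == 0)
        uint8_t ed = nextop & 7;        // rm field  -> xmm3 (rex.b == 0)
        assert(gd == ed);               // takes the VXOR_VV(q0, q0, q0) path
        return 0;
    }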


@ -402,13 +402,16 @@ void extcacheUnwind(extcache_t* cache)
}
// add/change bad regs
for(int i=0; i<16; ++i) {
if(cache->olds[i].changed) {
cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
} else if(cache->olds[i].purged) {
cache->extcache[i].n = i;
cache->extcache[i].t = cache->olds[i].single?EXT_CACHE_SS:EXT_CACHE_SD;
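// olds[] only tracks float (SS/SD) entries; skip vector-backed XMM entries
// so their extcache type is not clobbered here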
if (cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_SD) {
if (cache->olds[i].changed) {
cache->extcache[i].t = cache->olds[i].single ? EXT_CACHE_SS : EXT_CACHE_SD;
} else if (cache->olds[i].purged) {
cache->extcache[i].n = i;
cache->extcache[i].t = cache->olds[i].single ? EXT_CACHE_SS : EXT_CACHE_SD;
}
}
}
if(cache->stack_push) {
// unpush
for(int j=0; j<24; ++j) {
@ -465,14 +468,23 @@ void extcacheUnwind(extcache_t* cache)
break;
case EXT_CACHE_SS:
cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
cache->ssecache[cache->extcache[i].n].vector = 0;
cache->ssecache[cache->extcache[i].n].single = 1;
++cache->fpu_reg;
break;
case EXT_CACHE_SD:
cache->ssecache[cache->extcache[i].n].reg = EXTREG(i);
cache->ssecache[cache->extcache[i].n].vector = 0;
cache->ssecache[cache->extcache[i].n].single = 0;
++cache->fpu_reg;
break;
case EXT_CACHE_XMMR:
case EXT_CACHE_XMMW:
cache->ssecache[cache->extcache[i].n].reg = i;
cache->ssecache[cache->extcache[i].n].vector = 1;
cache->ssecache[cache->extcache[i].n].write = (cache->extcache[i].t == EXT_CACHE_XMMW) ? 1 : 0;
++cache->fpu_reg;
break;
case EXT_CACHE_ST_F:
case EXT_CACHE_ST_D:
case EXT_CACHE_ST_I64:
@ -556,6 +568,8 @@ const char* getCacheName(int t, int n)
case EXT_CACHE_SS: sprintf(buff, "SS%d", n); break;
case EXT_CACHE_SD: sprintf(buff, "SD%d", n); break;
case EXT_CACHE_SCR: sprintf(buff, "Scratch"); break;
case EXT_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
case EXT_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
case EXT_CACHE_NONE: buff[0]='\0'; break;
}
return buff;
@ -570,6 +584,12 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
"fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fs8", "fs9", "fs10", "fs11",
"ft8", "ft9", "ft10", "ft11"
};
static const char* vnames[] = {
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v8", "v9", "v30", "v31",
};
if(box64_dynarec_dump) {
printf_x64_instruction(rex.is32bits?my_context->dec32:my_context->dec, &dyn->insts[ninst].x64, name);
dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, barrier=%d state=%d/%d(%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d/%d",
@ -607,6 +627,8 @@ void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t r
case EXT_CACHE_MM: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
case EXT_CACHE_SS: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
case EXT_CACHE_SD: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
case EXT_CACHE_XMMR: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
case EXT_CACHE_XMMW: dynarec_log(LOG_NONE, " %s:%s", vnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
case EXT_CACHE_SCR: dynarec_log(LOG_NONE, " %s:%s", fnames[EXTREG(ii)], getCacheName(dyn->insts[ninst].e.extcache[ii].t, dyn->insts[ninst].e.extcache[ii].n)); break;
case EXT_CACHE_NONE:
default: break;
@ -689,4 +711,4 @@ void fpu_reset_ninst(dynarec_rv64_t* dyn, int ninst)
int fpu_is_st_freed(dynarec_rv64_t* dyn, int ninst, int st)
{
return (dyn->e.tags&(0b11<<(st*2)))?1:0;
}
}


@ -1570,10 +1570,15 @@ static void mmx_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
// get ext register for a SSE reg, create the entry if needed
int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
{
if(dyn->e.ssecache[a].v!=-1) {
if (dyn->e.ssecache[a].v != -1) {
if (dyn->e.ssecache[a].vector == 1) {
// it's in the fpu, forget it first...
sse_forget_reg_vector(dyn, ninst, s1, a);
return sse_get_reg(dyn, ninst, s1, a, single);
}
// forget / reload if change of size
if(dyn->e.ssecache[a].single!=single) {
sse_forget_reg(dyn, ninst, a);
sse_forget_reg(dyn, ninst, s1, a);
// update olds after the forget...
dyn->e.olds[a].changed = 1;
dyn->e.olds[a].purged = 0;
@ -1585,41 +1590,52 @@ int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
int ret = dyn->e.ssecache[a].reg;
dyn->e.ssecache[a].single = single;
dyn->e.ssecache[a].vector = 0;
if(dyn->e.ssecache[a].single)
FLW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
else
FLD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
return ret;
}
// get ext register for a SSE reg, but don't try to synch it if it needed to be created
int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single)
{
if(dyn->e.ssecache[a].v!=-1) {
if(dyn->e.ssecache[a].single!=single) {
if (dyn->e.ssecache[a].v != -1) {
if (dyn->e.ssecache[a].vector == 1) {
// it's in the fpu, forget it first...
sse_forget_reg_vector(dyn, ninst, s1, a);
return sse_get_reg_empty(dyn, ninst, s1, a, single);
}
if (dyn->e.ssecache[a].single != single) {
if (single) {
// writing back the double
// writing back the double, to clear the upper 32 bits.
FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
// need to wipe the half high 32bits of old Double because we now have a single
//SW(xZR, xEmu, offsetof(x64emu_t, xmm[a])+4);
}
dyn->e.olds[a].changed = 1;
dyn->e.olds[a].purged = 0;
dyn->e.olds[a].reg = EXTIDX(dyn->e.ssecache[a].reg);
dyn->e.olds[a].single = 1-single;
dyn->e.ssecache[a].single = single;
dyn->e.ssecache[a].vector = 0;
dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = single?EXT_CACHE_SS:EXT_CACHE_SD;
}
return dyn->e.ssecache[a].reg;
}
dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, single?EXT_CACHE_SS:EXT_CACHE_SD, a);
dyn->e.ssecache[a].single = single;
dyn->e.ssecache[a].vector = 0;
return dyn->e.ssecache[a].reg;
}
// forget ext register for an SSE reg, does nothing if the reg is not loaded
void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
{
if(dyn->e.ssecache[a].v==-1)
if (dyn->e.ssecache[a].v == -1)
return;
if (dyn->e.ssecache[a].vector == 1)
return sse_forget_reg_vector(dyn, ninst, s1, a);
if(dyn->e.ssecache[a].single)
FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
else
@ -1632,24 +1648,93 @@ void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a)
dyn->e.ssecache[a].v = -1;
return;
}
// get rvv register for an SSE reg, create the entry if needed
int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwrite)
{
if (dyn->e.ssecache[a].v != -1) {
if (dyn->e.ssecache[a].vector == 0) {
// it's in the fpu, forget it first...
sse_forget_reg(dyn, ninst, s1, a);
return sse_get_reg_vector(dyn, ninst, s1, a, forwrite);
}
if (forwrite) {
dyn->e.ssecache[a].write = 1; // update only if forwrite
dyn->e.ssecache[a].single = 0; // just to be clean
dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = EXT_CACHE_XMMW;
}
return dyn->e.ssecache[a].reg;
}
dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, forwrite ? EXT_CACHE_XMMW : EXT_CACHE_XMMR, a);
int ret = dyn->e.ssecache[a].reg;
dyn->e.ssecache[a].write = forwrite;
dyn->e.ssecache[a].vector = 1;
dyn->e.ssecache[a].single = 0; // just to be clean
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
VLE8_V(ret, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
return ret;
}
// get rvv register for an SSE reg, but don't try to synch it if it needed to be created
int sse_get_reg_empty_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
{
if (dyn->e.ssecache[a].v != -1) {
if (dyn->e.ssecache[a].vector == 0) {
// it's in the fpu, forget it first...
sse_forget_reg(dyn, ninst, s1, a);
return sse_get_reg_empty_vector(dyn, ninst, s1, a);
}
dyn->e.ssecache[a].vector = 1;
dyn->e.ssecache[a].write = 1;
dyn->e.ssecache[a].single = 0; // just to be clean
dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t = EXT_CACHE_XMMW;
return dyn->e.ssecache[a].reg;
}
dyn->e.ssecache[a].reg = fpu_get_reg_xmm(dyn, EXT_CACHE_XMMW, a);
dyn->e.ssecache[a].vector = 1;
dyn->e.ssecache[a].single = 0; // just to be clean
dyn->e.ssecache[a].write = 1; // it will be write...
return dyn->e.ssecache[a].reg;
}
// forget rvv register for an SSE reg, does nothing if the reg is not loaded
void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a)
{
if (dyn->e.ssecache[a].v == -1)
return;
if (dyn->e.ssecache[a].vector == 0)
return sse_forget_reg(dyn, ninst, s1, a);
if (dyn->e.extcache[EXTIDX(dyn->e.ssecache[a].reg)].t == EXT_CACHE_XMMW) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
}
fpu_free_reg(dyn, dyn->e.ssecache[a].reg);
dyn->e.ssecache[a].v = -1;
return;
}
// purge the SSE cache for XMM0..XMM7 (to use before function native call)
void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1)
{
int old = -1;
for (int i=0; i<8; ++i)
if(dyn->e.ssecache[i].v!=-1) {
if (old==-1) {
for (int i = 0; i < 8; ++i)
if (dyn->e.ssecache[i].v != -1) {
if (old == -1) {
MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n");
++old;
}
if(dyn->e.ssecache[i].single)
if (dyn->e.ssecache[i].vector) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
} else if (dyn->e.ssecache[i].single)
FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
else
FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
dyn->e.ssecache[i].v = -1;
}
if(old!=-1) {
if (old != -1) {
MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n");
}
}
@ -1664,17 +1749,25 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next?"locally ":"");
++old;
}
if(dyn->e.ssecache[i].single)
if (dyn->e.ssecache[i].vector) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
} else if (dyn->e.ssecache[i].single)
FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
else
FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
if(!next) {
fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
dyn->e.olds[i].changed = 0;
dyn->e.olds[i].purged = 1;
dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
dyn->e.olds[i].single = dyn->e.ssecache[i].single;
dyn->e.ssecache[i].v = -1;
if (dyn->e.ssecache[i].vector) {
fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
dyn->e.ssecache[i].v = -1;
} else {
fpu_free_reg(dyn, dyn->e.ssecache[i].reg);
dyn->e.olds[i].changed = 0;
dyn->e.olds[i].purged = 1;
dyn->e.olds[i].reg = dyn->e.ssecache[i].reg;
dyn->e.olds[i].single = dyn->e.ssecache[i].single;
dyn->e.ssecache[i].v = -1;
}
}
}
if(old!=-1) {
@ -1684,20 +1777,26 @@ static void sse_purgecache(dynarec_rv64_t* dyn, int ninst, int next, int s1)
static void sse_reflectcache(dynarec_rv64_t* dyn, int ninst, int s1)
{
for (int i=0; i<16; ++i)
if(dyn->e.ssecache[i].v!=-1) {
if(dyn->e.ssecache[i].single)
for (int i = 0; i < 16; ++i)
if (dyn->e.ssecache[i].v != -1) {
if (dyn->e.ssecache[i].vector) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
} else if (dyn->e.ssecache[i].single)
FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
else
FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
}
}
void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int a)
void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a)
{
if (dyn->e.ssecache[a].v == -1)
return;
if (dyn->e.ssecache[a].single)
if (dyn->e.ssecache[a].vector) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[a]));
VSE8_V(dyn->e.ssecache[a].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
} else if (dyn->e.ssecache[a].single)
FSW(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
else
FSD(dyn->e.ssecache[a].reg, xEmu, offsetof(x64emu_t, xmm[a]));
@ -1717,7 +1816,10 @@ void fpu_pushcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
MESSAGE(LOG_DUMP, "\tPush XMM Cache (%d)------\n", n);
for (int i=start; i<8; ++i)
if(dyn->e.ssecache[i].v!=-1) {
if(dyn->e.ssecache[i].single)
if (dyn->e.ssecache[i].vector) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
VSE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
} else if (dyn->e.ssecache[i].single)
FSW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
else
FSD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
@ -1760,7 +1862,10 @@ void fpu_popcache(dynarec_rv64_t* dyn, int ninst, int s1, int not07)
MESSAGE(LOG_DUMP, "\tPop XMM Cache (%d)------\n", n);
for (int i=start; i<8; ++i)
if(dyn->e.ssecache[i].v!=-1) {
if(dyn->e.ssecache[i].single)
if (dyn->e.ssecache[i].vector) {
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[i]));
VLE8_V(dyn->e.ssecache[i].reg, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
} else if (dyn->e.ssecache[i].single)
FLW(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
else
FLD(dyn->e.ssecache[i].reg, xEmu, offsetof(x64emu_t, xmm[i]));
@ -1829,6 +1934,14 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_
if (t == EXT_CACHE_ST_D)
return i;
break;
case EXT_CACHE_XMMR:
if (t == EXT_CACHE_XMMW)
return i;
break;
case EXT_CACHE_XMMW:
if (t == EXT_CACHE_XMMR)
return i;
break;
}
}
}
@ -1837,16 +1950,33 @@ static int findCacheSlot(dynarec_rv64_t* dyn, int ninst, int t, int n, extcache_
static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *cache)
{
if (i==j)
if (i == j) return;
if (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW || cache->extcache[j].t == EXT_CACHE_XMMR || cache->extcache[j].t == EXT_CACHE_XMMW) {
if (!cache->extcache[i].v) {
// a mov is enough, no need to swap
MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j);
VOR_VV(i, j, j, VECTOR_UNMASKED);
cache->extcache[i].v = cache->extcache[j].v;
cache->extcache[j].v = 0;
return;
}
// SWAP
ext_cache_t tmp;
MESSAGE(LOG_DUMP, "\t - Swapping %d <-> %d\n", i, j);
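// swap without a scratch vector register: the classic triple-XOR trick,
// a ^= b; b ^= a; a ^= b leaves the two registers exchanged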
VXOR_VV(i, i, j, VECTOR_UNMASKED);
VXOR_VV(j, i, j, VECTOR_UNMASKED);
VXOR_VV(i, i, j, VECTOR_UNMASKED);
tmp.v = cache->extcache[i].v;
cache->extcache[i].v = cache->extcache[j].v;
cache->extcache[j].v = tmp.v;
return;
}
int reg_i = EXTREG(i);
int reg_j = EXTREG(j);
int i_single = 0;
if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
i_single =1;
int j_single = 0;
if(cache->extcache[j].t==EXT_CACHE_SS || cache->extcache[j].t==EXT_CACHE_ST_F)
j_single =1;
int i_single = cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_ST_F;
int j_single = cache->extcache[j].t == EXT_CACHE_SS || cache->extcache[j].t == EXT_CACHE_ST_F;
if(!cache->extcache[i].v) {
// a mov is enough, no need to swap
@ -1887,17 +2017,22 @@ static void swapCache(dynarec_rv64_t* dyn, int ninst, int i, int j, extcache_t *
static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, extcache_t* cache, int i, int t, int n)
{
int reg = EXTREG(i);
if(cache->extcache[i].v) {
int single = 0;
if(t==EXT_CACHE_SS || t==EXT_CACHE_ST_F)
single = 1;
if(cache->extcache[i].t==EXT_CACHE_SS || cache->extcache[i].t==EXT_CACHE_ST_F)
single = 1;
int j = i+1;
while(cache->extcache[j].v)
++j;
if (cache->extcache[i].v && (cache->extcache[i].t == EXT_CACHE_XMMR || cache->extcache[i].t == EXT_CACHE_XMMW)) {
int j = i + 1;
while (cache->extcache[j].v) ++j;
MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i);
if(single) {
VOR_VV(j, i, i, VECTOR_UNMASKED);
cache->extcache[j].v = cache->extcache[i].v;
} else if (cache->extcache[i].v) {
int single = 0;
if (t == EXT_CACHE_SS || t == EXT_CACHE_ST_F)
single = 1;
if (cache->extcache[i].t == EXT_CACHE_SS || cache->extcache[i].t == EXT_CACHE_ST_F)
single = 1;
int j = i + 1;
while (cache->extcache[j].v) ++j;
MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i);
if (single) {
FMVS(EXTREG(j), reg);
} else {
FMVD(EXTREG(j), reg);
@ -1905,6 +2040,12 @@ static void loadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, int
cache->extcache[j].v = cache->extcache[i].v;
}
switch(t) {
case EXT_CACHE_XMMR:
case EXT_CACHE_XMMW:
MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
VLE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
break;
case EXT_CACHE_SS:
MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n));
FLW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@ -1956,6 +2097,14 @@ static void unloadCache(dynarec_rv64_t* dyn, int ninst, int stack_cnt, int s1, i
{
int reg = EXTREG(i);
switch(t) {
case EXT_CACHE_XMMR:
MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n));
break;
case EXT_CACHE_XMMW:
MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
ADDI(s1, xEmu, offsetof(x64emu_t, xmm[n]));
VSE8_V(i, s1, VECTOR_UNMASKED, VECTOR_NFIELD1);
break;
case EXT_CACHE_SS:
MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n));
FSW(reg, xEmu, offsetof(x64emu_t, xmm[n]));
@ -2045,43 +2194,47 @@ static void fpuCacheTransform(dynarec_rv64_t* dyn, int ninst, int s1, int s2, in
int s2_val = 0;
// unload every unneeded cache
// check SSE first, then MMX, in order, for optimisation reasons
for(int i=0; i<16; ++i) {
int j=findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2)==-1)
if (rv64_vector) vector_vsetvl_emul1(dyn, ninst, s1, VECTOR_SEW8);
for (int i = 0; i < 16; ++i) {
int j = findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache);
if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SS, i, &cache_i2) == -1)
unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
j=findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2)==-1)
j = findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache);
if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_SD, i, &cache_i2) == -1)
unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
j = findCacheSlot(dyn, ninst, EXT_CACHE_XMMW, i, &cache);
if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_XMMW, i, &cache_i2) == -1)
unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
}
for(int i=0; i<8; ++i) {
int j=findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
if(j>=0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2)==-1)
for (int i = 0; i < 8; ++i) {
int j = findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache);
if (j >= 0 && findCacheSlot(dyn, ninst, EXT_CACHE_MM, i, &cache_i2) == -1)
unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.extcache[j].t, cache.extcache[j].n);
}
for(int i=0; i<24; ++i) {
for (int i = 0; i < 24; ++i) {
if(cache.extcache[i].v)
if(findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2)==-1)
if (findCacheSlot(dyn, ninst, cache.extcache[i].t, cache.extcache[i].n, &cache_i2) == -1)
unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.extcache[i].t, cache.extcache[i].n);
}
// and now load/swap the missing one
for(int i=0; i<24; ++i) {
if(cache_i2.extcache[i].v) {
if(cache_i2.extcache[i].v != cache.extcache[i].v) {
for (int i = 0; i < 24; ++i) {
if (cache_i2.extcache[i].v) {
if (cache_i2.extcache[i].v != cache.extcache[i].v) {
int j;
if((j=findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache))==-1)
if ((j = findCacheSlot(dyn, ninst, cache_i2.extcache[i].t, cache_i2.extcache[i].n, &cache)) == -1)
loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.extcache[i].t, cache_i2.extcache[i].n);
else {
// it's here, lets swap if needed
if(j!=i)
if (j != i)
swapCache(dyn, ninst, i, j, &cache);
}
}
if(cache.extcache[i].t != cache_i2.extcache[i].t) {
if(cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
if (cache.extcache[i].t != cache_i2.extcache[i].t) {
if (cache.extcache[i].t == EXT_CACHE_ST_D && cache_i2.extcache[i].t == EXT_CACHE_ST_F) {
MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
FCVTSD(EXTREG(i), EXTREG(i));
cache.extcache[i].t = EXT_CACHE_ST_F;
} else if(cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
} else if (cache.extcache[i].t == EXT_CACHE_ST_F && cache_i2.extcache[i].t == EXT_CACHE_ST_D) {
MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.extcache[i].t, cache.extcache[i].n));
FCVTDS(EXTREG(i), EXTREG(i));
cache.extcache[i].t = EXT_CACHE_ST_D;
@ -2331,3 +2484,18 @@ void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
dyn->e.stack_push = 0;
dyn->e.swapped = 0;
}
// Use the vector extension like SIMD for now; this function sets the specified element width,
// the other configs are set automatically.
void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew)
{
/* mu: mask undisturbed
* tu: tail undisturbed
* sew: selected element width
* lmul: vector register group multiplier
*
* mu tu sew lmul=1 */
uint32_t vtypei = (0b0 << 7) | (0b0 << 6) | (sew << 3) | 0b000;
ADDI(s1, xZR, 16 >> sew);
VSETVLI(xZR, s1, vtypei);
}
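A worked expansion of the encoding above (standalone sketch; the SEW constants match the VECTOR_SEW* defines later in this diff): at LMUL=1 the requested AVL covers exactly one 128-bit register, so SEW8 asks for 16 elements and SEW64 for 2.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        for (uint32_t sew = 0; sew <= 3; ++sew) {                      // SEW8..SEW64
            uint32_t vtypei = (0u << 7) | (0u << 6) | (sew << 3) | 0u; // mu, tu, lmul=1
            uint32_t avl = 16u >> sew;                                 // elements per 128 bits
            printf("sew=%u -> %2u-bit elements, avl=%2u, vtypei=0x%02x\n",
                   sew, 8u << sew, avl, vtypei);
        }
        return 0;
    }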


@ -473,10 +473,10 @@
FLD(a, ed, fixedaddress); \
}
// Will get pointer to GX in general register a, will purge SS or SD if loaded. can use gback as load address
// Will get pointer to GX in general register a, will purge SS or SD if loaded. May use x3. Can use gback as load address
#define GETGX() \
gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \
sse_forget_reg(dyn, ninst, gd); \
sse_forget_reg(dyn, ninst, x3, gd); \
gback = xEmu; \
gdoffset = offsetof(x64emu_t, xmm[gd])
@ -484,7 +484,7 @@
#define GETEX(a, D) \
if (MODREG) { \
ed = (nextop & 7) + (rex.b << 3); \
sse_forget_reg(dyn, ninst, ed); \
sse_forget_reg(dyn, ninst, x3, ed); \
fixedaddress = offsetof(x64emu_t, xmm[ed]); \
wback = xEmu; \
} else { \
@ -494,6 +494,18 @@
fixedaddress = 0; /* TODO: optimize this! */ \
}
// Get EX as a quad (x1 is used)
#define GETEX_vector(a, w, D) \
if (MODREG) { \
a = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w); \
} else { \
SMREAD(); \
addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 1, D); \
a = fpu_get_scratch(dyn); \
ADDI(x2, ed, fixedaddress); \
VLE8_V(a, x2, VECTOR_UNMASKED, VECTOR_NFIELD1); \
}
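// Used like the scalar GETEX: in the PXOR case above, GETEX_vector(q1, 0, 0)
// either reuses the cached vector register for Ex or, for a memory operand,
// loads it into a scratch vector register with vle8.v.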
#define GETGM() \
gd = ((nextop & 0x38) >> 3); \
mmx_forget_reg(dyn, ninst, gd); \
@ -1093,6 +1105,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
#define dynarec64_F20F STEPNAME(dynarec64_F20F)
#define dynarec64_F30F STEPNAME(dynarec64_F30F)
#define dynarec64_660F_vector STEPNAME(dynarec64_660F_vector)
#define geted STEPNAME(geted)
#define geted32 STEPNAME(geted32)
#define geted16 STEPNAME(geted16)
@ -1223,6 +1237,10 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
#define sse_purge07cache STEPNAME(sse_purge07cache)
#define sse_reflect_reg STEPNAME(sse_reflect_reg)
#define sse_get_reg_empty_vector STEPNAME(sse_get_reg_empty_vector)
#define sse_get_reg_vector STEPNAME(sse_get_reg_vector)
#define sse_forget_reg_vector STEPNAME(sse_forget_reg_vector)
#define fpu_pushcache STEPNAME(fpu_pushcache)
#define fpu_popcache STEPNAME(fpu_popcache)
#define fpu_reset_cache STEPNAME(fpu_reset_cache)
@ -1238,6 +1256,8 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
#define rv64_move64 STEPNAME(rv64_move64)
#define rv64_move32 STEPNAME(rv64_move32)
#define vector_vsetvl_emul1 STEPNAME(vector_vsetvl_emul1)
/* setup r2 to address pointed by */
uintptr_t geted(dynarec_rv64_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
@ -1392,6 +1412,8 @@ void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2
void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val);
void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zeroup);
void vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew);
#if STEP < 2
#define CHECK_CACHE() 0
#else
@ -1435,14 +1457,20 @@ void mmx_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
// SSE/SSE2 helpers
// get float register for a SSE reg, create the entry if needed
int sse_get_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
// get rvv register for an SSE reg, create the entry if needed
int sse_get_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a, int forwrite);
// get float register for a SSE reg, but don't try to synch it if it needed to be created
int sse_get_reg_empty(dynarec_rv64_t* dyn, int ninst, int s1, int a, int single);
// get rvv register for an SSE reg, but don't try to synch it if it needed to be created
int sse_get_reg_empty_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a);
// forget float register for an SSE reg, does nothing if the reg is not loaded
void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int a);
void sse_forget_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);
// forget rvv register for an SSE reg, does nothing if the reg is not loaded
void sse_forget_reg_vector(dynarec_rv64_t* dyn, int ninst, int s1, int a);
// purge the XMM0..XMM7 cache (before function call)
void sse_purge07cache(dynarec_rv64_t* dyn, int ninst, int s1);
// Push current value to the cache
void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int a);
void sse_reflect_reg(dynarec_rv64_t* dyn, int ninst, int s1, int a);
// common coproc helpers
// reset the cache with n
@ -1489,6 +1517,8 @@ uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
#if STEP < 2
#define PASS2(A)
#else


@ -18,6 +18,9 @@ typedef struct instsize_s instsize_t;
#define EXT_CACHE_SS 5
#define EXT_CACHE_SD 6
#define EXT_CACHE_SCR 7
#define EXT_CACHE_XMMW 8
#define EXT_CACHE_XMMR 9
typedef union ext_cache_s {
int8_t v;
struct {
@ -25,13 +28,18 @@ typedef union ext_cache_s {
uint8_t n:4; // reg number
};
} ext_cache_t;
typedef union sse_cache_s {
int8_t v;
int16_t v;
struct {
uint8_t reg:7;
uint8_t single:1;
uint16_t reg : 7;
uint16_t vector : 1;
uint16_t single : 1;
uint16_t write : 1;
uint16_t unused : 6;
};
} sse_cache_t;
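With the bitfields totalling exactly 16 bits (7 + 1 + 1 + 1 + 6), the union stays aliasable through its 16-bit v, which the cache code relies on for the v == -1 "empty" convention. A compile-time check one could add (illustrative, not part of the diff):

    #include <assert.h>
    static_assert(sizeof(sse_cache_t) == sizeof(int16_t),
                  "sse_cache_t bitfields must pack into 16 bits");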
typedef union sse_old_s {
int8_t v;
struct {
@ -41,6 +49,7 @@ typedef union sse_old_s {
uint8_t single:1;
};
} sse_old_t;
typedef struct extcache_s {
// ext cache
ext_cache_t extcache[24];


@ -1206,6 +1206,23 @@ f28-31 ft8-11 FP temporaries Caller
// Vector extension emitter
#define VECTOR_SEW8 0b000
#define VECTOR_SEW16 0b001
#define VECTOR_SEW32 0b010
#define VECTOR_SEW64 0b011
#define VECTOR_MASKED 0
#define VECTOR_UNMASKED 1
#define VECTOR_NFIELD1 0b000
#define VECTOR_NFIELD2 0b001
#define VECTOR_NFIELD3 0b010
#define VECTOR_NFIELD4 0b011
#define VECTOR_NFIELD5 0b100
#define VECTOR_NFIELD6 0b101
#define VECTOR_NFIELD7 0b110
#define VECTOR_NFIELD8 0b111
// configuration setting
// https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
#define VSETIVLI(rd, zimm, zimm10) EMIT(I_type(0b110000000000 | (zimm10), zimm, 0b111, rd, 0b1010111)) // 11...............111.....1010111


@ -69,9 +69,10 @@ void RV64_Detect_Function()
rv64_zbs = Check(my_block);
// Test Vector v1.0 with CSRR zero, vcsr
block = (uint32_t*)my_block;
CSRRS(xZR, xZR, 0x00f);
BR(xRA);
rv64_vector = Check(my_block);
rv64_vector = Check(my_block); // TODO: also check vlen >= 128
// THead vendor extensions
if (!rv64_zba) {