// box64/src/dynarec/arm64/dynarec_arm64_functions.c

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <math.h>
#include <signal.h>
#include <sys/types.h>
#include <unistd.h>
#include "debug.h"
#include "box64context.h"
#include "box64cpu.h"
#include "emu/x64emu_private.h"
#include "x64emu.h"
#include "box64stack.h"
#include "callback.h"
#include "emu/x64run_private.h"
#include "emu/x87emu_private.h"
#include "x64trace.h"
#include "dynarec_native.h"
#include "dynarec_arm64_private.h"
#include "dynarec_arm64_functions.h"
#include "custommem.h"
#include "bridge.h"
#include "gdbjit.h"
#include "perfmap.h"
// Get an FPU scratch reg
int fpu_get_scratch(dynarec_arm_t* dyn, int ninst)
{
int ret = SCRATCH0 + dyn->n.fpu_scratch++;
if(dyn->n.ymm_used) printf_log(LOG_INFO, "Warning, getting a scratch register after getting some YMM at inst=%d\n", ninst);
if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
// should only happen in step 0...
dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
dyn->n.neoncache[ret].v = 0; // reset it
}
return ret;
}
// Get 2 consecutive FPU scratch regs
int fpu_get_double_scratch(dynarec_arm_t* dyn, int ninst)
{
int ret = SCRATCH0 + dyn->n.fpu_scratch;
if(dyn->n.ymm_used) printf_log(LOG_INFO, "Warning, getting a double scratch register after getting some YMM at inst=%d\n", ninst);
if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
// should only happen in step 0...
dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
dyn->n.neoncache[ret].v = 0; // reset it
}
if(dyn->n.neoncache[ret+1].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret+1].t==NEON_CACHE_YMMW) {
// should only happen in step 0...
dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret+1].n); // mark as purged
dyn->n.neoncache[ret+1].v = 0; // reset it
}
dyn->n.fpu_scratch+=2;
return ret;
}
// Reset scratch regs counter
void fpu_reset_scratch(dynarec_arm_t* dyn)
{
dyn->n.fpu_scratch = 0;
dyn->n.ymm_used = 0;
dyn->n.ymm_regs = 0;
dyn->n.ymm_write = 0;
dyn->n.ymm_removed = 0;
dyn->n.xmm_write = 0;
dyn->n.xmm_removed = 0;
}
// Get an x87 double reg
int fpu_get_reg_x87(dynarec_arm_t* dyn, int ninst, int t, int n)
{
int i=X870;
while (dyn->n.fpuused[i]) ++i;
if(dyn->n.neoncache[i].t==NEON_CACHE_YMMR || dyn->n.neoncache[i].t==NEON_CACHE_YMMW) {
// should only happen in step 0...
dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[i].n); // mark as purged
dyn->n.neoncache[i].v = 0; // reset it
}
dyn->n.fpuused[i] = 1;
dyn->n.neoncache[i].n = n;
dyn->n.neoncache[i].t = t;
dyn->n.news |= (1<<i);
return i; // return a Dx
}
// Free an FPU double reg
void fpu_free_reg(dynarec_arm_t* dyn, int reg)
{
// TODO: check upper limit?
dyn->n.fpuused[reg] = 0;
if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) {
dyn->n.ymm_removed |= 1<<dyn->n.neoncache[reg].n;
if(dyn->n.neoncache[reg].t==NEON_CACHE_YMMW)
dyn->n.ymm_write |= 1<<dyn->n.neoncache[reg].n;
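// remember which neon reg held this YMM in a 4-bit nibble of ymm_regs (8 and up = scratch reg, below 8 = EMM reg), so neoncacheUnwind can re-create it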
if(reg>SCRATCH0)
dyn->n.ymm_regs |= (8LL+reg-SCRATCH0)<<(dyn->n.neoncache[reg].n*4);
else
dyn->n.ymm_regs |= ((uint64_t)(reg-EMM0))<<(dyn->n.neoncache[reg].n*4);
}
if(dyn->n.neoncache[reg].t==NEON_CACHE_XMMR || dyn->n.neoncache[reg].t==NEON_CACHE_XMMW) {
dyn->n.xmm_removed |= 1<<dyn->n.neoncache[reg].n;
if(dyn->n.neoncache[reg].t==NEON_CACHE_XMMW)
dyn->n.xmm_write |= 1<<dyn->n.neoncache[reg].n;
}
if(dyn->n.neoncache[reg].t!=NEON_CACHE_ST_F && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_D && dyn->n.neoncache[reg].t!=NEON_CACHE_ST_I64)
dyn->n.neoncache[reg].v = 0;
if(dyn->n.fpu_scratch && reg==SCRATCH0+dyn->n.fpu_scratch-1)
--dyn->n.fpu_scratch;
}
// Get an MMX double reg
int fpu_get_reg_emm(dynarec_arm_t* dyn, int ninst, int emm)
{
int ret = EMM0 + emm;
if(dyn->n.neoncache[ret].t==NEON_CACHE_YMMR || dyn->n.neoncache[ret].t==NEON_CACHE_YMMW) {
// should only happen in step 0...
dyn->insts[ninst].purge_ymm |= (1<<dyn->n.neoncache[ret].n); // mark as purged
dyn->n.neoncache[ret].v = 0; // reset it
}
dyn->n.fpuused[ret] = 1;
dyn->n.neoncache[ret].t = NEON_CACHE_MM;
dyn->n.neoncache[ret].n = emm;
dyn->n.news |= (1<<(ret));
return ret;
}
// Get an XMM quad reg
int fpu_get_reg_xmm(dynarec_arm_t* dyn, int t, int xmm)
{
int i;
if(xmm>7) {
i = XMM8 + xmm - 8;
} else {
i = XMM0 + xmm;
}
dyn->n.fpuused[i] = 1;
dyn->n.neoncache[i].t = t;
dyn->n.neoncache[i].n = xmm;
dyn->n.news |= (1<<i);
return i;
}
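// Try to map YMM 'ymm' into neon reg 'reg' with access type t: reuse the slot if it already holds that YMM (upgrading read to write), claim it if free, else return -1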
int internal_mark_ymm(dynarec_arm_t* dyn, int t, int ymm, int reg)
{
if((dyn->n.neoncache[reg].t==NEON_CACHE_YMMR) || (dyn->n.neoncache[reg].t==NEON_CACHE_YMMW)) {
if(dyn->n.neoncache[reg].n == ymm) {
// already there!
if(t==NEON_CACHE_YMMW)
dyn->n.neoncache[reg].t=t;
return reg;
}
} else if(!dyn->n.neoncache[reg].v) {
// found a slot!
dyn->n.neoncache[reg].t=t;
dyn->n.neoncache[reg].n=ymm;
dyn->n.news |= (1<<reg);
return reg;
}
return -1;
}
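// Return 1 if the YMM cached in neon reg 'reg' must be kept: its number is one of k1/k2/k3, or it is marked as used by the current opcode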
int is_ymm_to_keep(dynarec_arm_t* dyn, int reg, int k1, int k2, int k3)
{
if((k1!=-1) && (dyn->n.neoncache[reg].n==k1))
return 1;
if((k2!=-1) && (dyn->n.neoncache[reg].n==k2))
return 1;
if((k3!=-1) && (dyn->n.neoncache[reg].n==k3))
return 1;
if((dyn->n.neoncache[reg].t==NEON_CACHE_YMMR || dyn->n.neoncache[reg].t==NEON_CACHE_YMMW) && (dyn->n.ymm_used&(1<<dyn->n.neoncache[reg].n)))
return 1;
return 0;
}
// Reset fpu regs counter
static void fpu_reset_reg_neoncache(neoncache_t* n)
{
n->fpu_reg = 0;
for (int i=0; i<32; ++i) {
n->fpuused[i]=0;
n->neoncache[i].v = 0;
}
n->ymm_regs = 0;
n->ymm_removed = 0;
n->ymm_used = 0;
n->ymm_write = 0;
n->xmm_removed = 0;
n->xmm_write = 0;
}
void fpu_reset_reg(dynarec_arm_t* dyn)
{
fpu_reset_reg_neoncache(&dyn->n);
}
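// If 'a' is NEON_CACHE_ST_I64, promote ST(st) to double and return NEON_CACHE_ST_D instead; otherwise return 'a' unchanged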
int neoncache_no_i64(dynarec_arm_t* dyn, int ninst, int st, int a)
{
if(a==NEON_CACHE_ST_I64) {
neoncache_promote_double(dyn, ninst, st);
return NEON_CACHE_ST_D;
}
return a;
}
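// Get the cache type (ST_F / ST_D / ST_I64) of ST(a) at instruction ninst, following a recorded swap; defaults to ST_D if not cached yet (the _current variant checks the live cache state instead)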
int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a)
{
if (dyn->insts[ninst].n.swapped) {
if(dyn->insts[ninst].n.combined1 == a)
a = dyn->insts[ninst].n.combined2;
else if(dyn->insts[ninst].n.combined2 == a)
a = dyn->insts[ninst].n.combined1;
}
for(int i=0; i<24; ++i)
if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
|| dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D
|| dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64)
&& dyn->insts[ninst].n.neoncache[i].n==a)
return dyn->insts[ninst].n.neoncache[i].t;
// not in the cache yet, so will be fetched...
return NEON_CACHE_ST_D;
}
int neoncache_get_current_st(dynarec_arm_t* dyn, int ninst, int a)
{
(void)ninst;
if(!dyn->insts)
return NEON_CACHE_ST_D;
for(int i=0; i<24; ++i)
if((dyn->n.neoncache[i].t==NEON_CACHE_ST_F
|| dyn->n.neoncache[i].t==NEON_CACHE_ST_D
|| dyn->n.neoncache[i].t==NEON_CACHE_ST_I64)
&& dyn->n.neoncache[i].n==a)
return dyn->n.neoncache[i].t;
// not in the cache yet, so will be fetched...
return NEON_CACHE_ST_D;
}
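// The *st_f* helpers below return the neon register index caching ST(a) (as float, or float/i64), or -1 if not cached; the _noback and _current variants only differ in which cache state they inspect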
int neoncache_get_st_f(dynarec_arm_t* dyn, int ninst, int a)
{
/*if(a+dyn->insts[ninst].n.stack_next-st<0)
// The STx has been pushed at the end of instruction, so stop going back
return -1;*/
for(int i=0; i<24; ++i)
if(dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
&& dyn->insts[ninst].n.neoncache[i].n==a)
return i;
return -1;
}
int neoncache_get_st_f_i64(dynarec_arm_t* dyn, int ninst, int a)
{
/*if(a+dyn->insts[ninst].n.stack_next-st<0)
// The STx has been pushed at the end of instruction, so stop going back
return -1;*/
for(int i=0; i<24; ++i)
if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F)
&& dyn->insts[ninst].n.neoncache[i].n==a)
return i;
return -1;
}
int neoncache_get_st_f_noback(dynarec_arm_t* dyn, int ninst, int a)
{
for(int i=0; i<24; ++i)
if(dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
&& dyn->insts[ninst].n.neoncache[i].n==a)
return i;
return -1;
}
int neoncache_get_st_f_i64_noback(dynarec_arm_t* dyn, int ninst, int a)
{
for(int i=0; i<24; ++i)
if((dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F)
&& dyn->insts[ninst].n.neoncache[i].n==a)
return i;
return -1;
}
int neoncache_get_current_st_f(dynarec_arm_t* dyn, int a)
{
for(int i=0; i<24; ++i)
if(dyn->n.neoncache[i].t==NEON_CACHE_ST_F
&& dyn->n.neoncache[i].n==a)
return i;
return -1;
}
int neoncache_get_current_st_f_i64(dynarec_arm_t* dyn, int a)
{
for(int i=0; i<24; ++i)
if((dyn->n.neoncache[i].t==NEON_CACHE_ST_I64 || dyn->n.neoncache[i].t==NEON_CACHE_ST_F)
&& dyn->n.neoncache[i].n==a)
return i;
return -1;
}
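// x87 double "promotion": when an ST value cached as single-precision (or as i64) turns out to need double precision,
// retag it NEON_CACHE_ST_D everywhere it lives, walking backward through the instructions (and across combined/swapped pairs),
// adjusting the ST index by each instruction's stack pushes/pops and stopping at barriers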
static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int maxinst, int a);
static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int maxinst, int a);
static void neoncache_promote_double_combined(dynarec_arm_t* dyn, int ninst, int maxinst, int a)
{
if(a == dyn->insts[ninst].n.combined1 || a == dyn->insts[ninst].n.combined2) {
if(a == dyn->insts[ninst].n.combined1) {
a = dyn->insts[ninst].n.combined2;
} else
a = dyn->insts[ninst].n.combined1;
int i = neoncache_get_st_f_i64_noback(dyn, ninst, a);
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_combined, ninst=%d combined%c %d i=%d (stack:%d/%d)\n", ninst, (a == dyn->insts[ninst].n.combined2)?'2':'1', a ,i, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop);
if(i>=0) {
dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
if(dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
if(!dyn->insts[ninst].n.barrier)
neoncache_promote_double_internal(dyn, ninst-1, maxinst, a-dyn->insts[ninst].n.stack_push);
// go forward if combined is not pop'd
if(a-dyn->insts[ninst].n.stack_pop>=0)
if(!((ninst+1<dyn->size) && dyn->insts[ninst+1].n.barrier))
neoncache_promote_double_forward(dyn, ninst+1, maxinst, a-dyn->insts[ninst].n.stack_pop);
}
}
}
static void neoncache_promote_double_internal(dynarec_arm_t* dyn, int ninst, int maxinst, int a)
{
while(ninst>=0) {
a+=dyn->insts[ninst].n.stack_pop; // adjust Stack depth: add pop'd ST (going backward)
int i = neoncache_get_st_f_i64(dyn, ninst, a);
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d, a=%d st=%d:%d, i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, i);
if(i<0) return;
dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
if(dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
// check combined propagation too
if(dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) {
if(dyn->insts[ninst].n.swapped) {
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack);
if (a==dyn->insts[ninst].n.combined1)
a = dyn->insts[ninst].n.combined2;
else if (a==dyn->insts[ninst].n.combined2)
a = dyn->insts[ninst].n.combined1;
} else {
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_internal, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack);
neoncache_promote_double_combined(dyn, ninst, maxinst, a);
}
}
a-=dyn->insts[ninst].n.stack_push; // adjust Stack depth: remove push'd ST (going backward)
--ninst;
if(ninst<0 || a<0 || dyn->insts[ninst].n.barrier)
return;
}
}
static void neoncache_promote_double_forward(dynarec_arm_t* dyn, int ninst, int maxinst, int a)
{
while((ninst!=-1) && (ninst<maxinst) && (a>=0)) {
a+=dyn->insts[ninst].n.stack_push; // adjust Stack depth: add push'd ST (going forward)
if((dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) && dyn->insts[ninst].n.swapped) {
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d swapped %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack);
if (a==dyn->insts[ninst].n.combined1)
a = dyn->insts[ninst].n.combined2;
else if (a==dyn->insts[ninst].n.combined2)
a = dyn->insts[ninst].n.combined1;
}
int i = neoncache_get_st_f_i64_noback(dyn, ninst, a);
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d, a=%d st=%d:%d(%d/%d), i=%d\n", ninst, a, dyn->insts[ninst].n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, -dyn->insts[ninst].n.stack_pop, i);
if(i<0) return;
dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
if(dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
// check combined propagation too
if((dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) && !dyn->insts[ninst].n.swapped) {
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double_forward, ninst=%d combined %d/%d vs %d with st %d\n", ninst, dyn->insts[ninst].n.combined1 ,dyn->insts[ninst].n.combined2, a, dyn->insts[ninst].n.stack);
neoncache_promote_double_combined(dyn, ninst, maxinst, a);
}
a-=dyn->insts[ninst].n.stack_pop; // adjust Stack depth: remove pop'd ST (going forward)
if(dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].n.barrier)
++ninst;
else
ninst=-1;
}
if(ninst==maxinst)
neoncache_promote_double(dyn, ninst, a);
}
void neoncache_promote_double(dynarec_arm_t* dyn, int ninst, int a)
{
int i = neoncache_get_current_st_f_i64(dyn, a);
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d a=%d st=%d i=%d\n", ninst, a, dyn->n.stack, i);
if(i<0) return;
dyn->n.neoncache[i].t = NEON_CACHE_ST_D;
dyn->insts[ninst].n.neoncache[i].t = NEON_CACHE_ST_D;
if(dyn->insts[ninst].x87precision) dyn->need_x87check = 2;
// check combined propagation too
if(dyn->n.combined1 || dyn->n.combined2) {
if(dyn->n.swapped) {
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d swapped! %d/%d vs %d\n", ninst, dyn->n.combined1 ,dyn->n.combined2, a);
if(dyn->n.combined1 == a)
a = dyn->n.combined2;
else if(dyn->n.combined2 == a)
a = dyn->n.combined1;
} else {
//if(dyn->need_dump) dynarec_log(LOG_NONE, "neoncache_promote_double, ninst=%d combined! %d/%d vs %d\n", ninst, dyn->n.combined1 ,dyn->n.combined2, a);
if(dyn->n.combined1 == a)
neoncache_promote_double(dyn, ninst, dyn->n.combined2);
else if(dyn->n.combined2 == a)
neoncache_promote_double(dyn, ninst, dyn->n.combined1);
}
}
a-=dyn->insts[ninst].n.stack_push; // adjust Stack depth: remove push'd ST (going backward)
if(!ninst || a<0) return;
neoncache_promote_double_internal(dyn, ninst-1, ninst, a);
}
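// Record that ST(a) and ST(b) are used together by the current opcode and return the common cache type: ST_F only if both are already floats, ST_D otherwise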
int neoncache_combine_st(dynarec_arm_t* dyn, int ninst, int a, int b)
{
dyn->n.combined1=a;
dyn->n.combined2=b;
if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_F
&& neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_F )
return NEON_CACHE_ST_F;
// don't combine i64, it's only for load/store
/*if( neoncache_get_current_st(dyn, ninst, a)==NEON_CACHE_ST_I64
&& neoncache_get_current_st(dyn, ninst, b)==NEON_CACHE_ST_I64 )
return NEON_CACHE_ST_I64;*/
return NEON_CACHE_ST_D;
}
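// Return 1 if nothing needs flushing at the end of ninst: stack_next is 0 and only ST entries about to be popped remain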
static int isCacheEmpty(dynarec_native_t* dyn, int ninst) {
if(dyn->insts[ninst].n.stack_next) {
return 0;
}
for(int i=0; i<24; ++i)
if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i
if(!(
(dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
|| dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D
|| dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64)
&& dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop))
return 0;
}
return 1;
}
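// Return 1 if the cache state at the end of ninst does not match what its jump target expects, so a transform (flush/reload) is needed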
int fpuCacheNeedsTransform(dynarec_arm_t* dyn, int ninst) {
int i2 = dyn->insts[ninst].x64.jmp_insts;
if(i2<0)
return 1;
if((dyn->insts[i2].x64.barrier&BARRIER_FLOAT))
// if the barrier has already been applied, no transform is needed
return ((dyn->insts[ninst].x64.barrier&BARRIER_FLOAT))?0:(isCacheEmpty(dyn, ninst)?0:1);
int ret = 0;
if(!i2) { // just purge
if(dyn->insts[ninst].n.stack_next)
return 1;
if(dyn->insts[ninst].ymm0_out)
return 1;
for(int i=0; i<32 && !ret; ++i)
if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i
if(!(
(dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_F
|| dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_D
|| dyn->insts[ninst].n.neoncache[i].t==NEON_CACHE_ST_I64)
&& dyn->insts[ninst].n.neoncache[i].n<dyn->insts[ninst].n.stack_pop))
ret = 1;
}
return ret;
}
// Check if ninst can be compatible to i2
if(dyn->insts[ninst].n.stack_next != dyn->insts[i2].n.stack-dyn->insts[i2].n.stack_push) {
return 1;
}
if(dyn->insts[ninst].ymm0_out && (dyn->insts[ninst].ymm0_out&~dyn->insts[i2].ymm0_in))
return 1;
neoncache_t cache_i2 = dyn->insts[i2].n;
neoncacheUnwind(&cache_i2);
for(int i=0; i<32; ++i) {
if(dyn->insts[ninst].n.neoncache[i].v) { // there is something at ninst for i
if(!cache_i2.neoncache[i].v) { // but there is nothing at i2 for i
ret = 1;
} else if(dyn->insts[ninst].n.neoncache[i].v!=cache_i2.neoncache[i].v) { // there is something different
if(dyn->insts[ninst].n.neoncache[i].n!=cache_i2.neoncache[i].n) { // not the same x64 reg
ret = 1;
}
else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_XMMR && cache_i2.neoncache[i].t == NEON_CACHE_XMMW)
{/* nothing */ }
else if(dyn->insts[ninst].n.neoncache[i].t == NEON_CACHE_YMMR && cache_i2.neoncache[i].t == NEON_CACHE_YMMW)
{/* nothing */ }
else
ret = 1;
}
} else if(cache_i2.neoncache[i].v)
ret = 1;
}
return ret;
}
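// Rewind the per-instruction changes of a neoncache (swap, newly created regs, stack push/pop) to recover the state at instruction entry,
// then rebuild the x87/MMX/SSE tracking info from the neoncache slots and re-create the XMM/YMM entries removed during the instruction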
void neoncacheUnwind(neoncache_t* cache)
{
if(cache->swapped) {
// unswap
int a = -1;
int b = -1;
// in neoncache
for(int j=0; j<24 && ((a==-1) || (b==-1)); ++j)
if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) {
if(cache->neoncache[j].n == cache->combined1)
a = j;
else if(cache->neoncache[j].n == cache->combined2)
b = j;
}
if(a!=-1 && b!=-1) {
int tmp = cache->neoncache[a].n;
cache->neoncache[a].n = cache->neoncache[b].n;
cache->neoncache[b].n = tmp;
}
// done
cache->swapped = 0;
cache->combined1 = cache->combined2 = 0;
}
if(cache->news) {
// remove the newly created neoncache
for(int i=0; i<32; ++i)
if(cache->news&(1<<i))
cache->neoncache[i].v = 0;
cache->news = 0;
}
if(cache->stack_push) {
// unpush
for(int j=0; j<24; ++j) {
if((cache->neoncache[j].t == NEON_CACHE_ST_D || cache->neoncache[j].t == NEON_CACHE_ST_F || cache->neoncache[j].t == NEON_CACHE_ST_I64)) {
if(cache->neoncache[j].n<cache->stack_push)
cache->neoncache[j].v = 0;
else
cache->neoncache[j].n-=cache->stack_push;
}
}
cache->x87stack-=cache->stack_push;
cache->tags>>=(cache->stack_push*2);
cache->stack-=cache->stack_push;
if(cache->pushed>=cache->stack_push)
cache->pushed-=cache->stack_push;
else
cache->pushed = 0;
cache->stack_push = 0;
}
cache->x87stack+=cache->stack_pop;
cache->stack_next = cache->stack;
if(cache->stack_pop) {
if(cache->poped>=cache->stack_pop)
cache->poped-=cache->stack_pop;
else
cache->poped = 0;
cache->tags<<=(cache->stack_pop*2);
}
cache->stack_pop = 0;
cache->barrier = 0;
// And now, rebuild the x87cache info with neoncache
cache->mmxcount = 0;
cache->fpu_scratch = 0;
cache->fpu_reg = 0;
for(int i=0; i<8; ++i) {
cache->x87cache[i] = -1;
cache->mmxcache[i] = -1;
cache->x87reg[i] = 0;
cache->ssecache[i*2].v = -1;
cache->ssecache[i*2+1].v = -1;
}
int x87reg = 0;
for(int i=0; i<32; ++i) {
if(cache->neoncache[i].v) {
cache->fpuused[i] = 1;
switch (cache->neoncache[i].t) {
case NEON_CACHE_MM:
cache->mmxcache[cache->neoncache[i].n] = i;
++cache->mmxcount;
++cache->fpu_reg;
break;
case NEON_CACHE_XMMR:
case NEON_CACHE_XMMW:
cache->ssecache[cache->neoncache[i].n].reg = i;
cache->ssecache[cache->neoncache[i].n].write = (cache->neoncache[i].t==NEON_CACHE_XMMW)?1:0;
++cache->fpu_reg;
break;
case NEON_CACHE_YMMR:
case NEON_CACHE_YMMW:
cache->fpuused[i] = 0; // YMM does not mark the fpu reg as used
break;
case NEON_CACHE_ST_F:
case NEON_CACHE_ST_D:
case NEON_CACHE_ST_I64:
cache->x87cache[x87reg] = cache->neoncache[i].n;
cache->x87reg[x87reg] = i;
++x87reg;
++cache->fpu_reg;
break;
case NEON_CACHE_SCR:
cache->fpuused[i] = 0;
cache->neoncache[i].v = 0;
break;
}
} else {
cache->fpuused[i] = 0;
}
}
// add back removed XMM
if(cache->xmm_removed) {
for(int i=0; i<16; ++i)
if(cache->xmm_removed&(1<<i)) {
int reg = (i<8)?(XMM0+i):(XMM8+i-8);
cache->neoncache[reg].t = (cache->xmm_write&(1<<i))?NEON_CACHE_XMMW:NEON_CACHE_XMMR;
cache->neoncache[reg].n = i;
}
cache->xmm_write = cache->xmm_removed = 0;
}
// add back removed YMM
if(cache->ymm_removed) {
for(int i=0; i<16; ++i)
if(cache->ymm_removed&(1<<i)) {
int reg = cache->ymm_regs>>(i*4)&15;
if(reg>7)
reg = reg - 8 + SCRATCH0;
else
reg = reg + EMM0;
//if(cache->neoncache[reg].v) // this is normal when a ymm is purged to make space for another one
// printf_log(LOG_INFO, "Warning, recreating YMM%d on non empty slot %s", i, getCacheName(cache->neoncache[reg].t, cache->neoncache[reg].n));
cache->neoncache[reg].t = (cache->ymm_write&(1<<i))?NEON_CACHE_YMMW:NEON_CACHE_YMMR;
cache->neoncache[reg].n = i;
}
cache->ymm_regs = 0;
cache->ymm_write = cache->ymm_removed = 0;
}
cache->ymm_used = 0;
}
#define F8 *(uint8_t*)(addr++)
#define F32S64 (uint64_t)(int64_t)*(int32_t*)(addr+=4, addr-4)
// Check if ED will have the correct parity (i.e. alignment), without emitting anything. Parity is 2 for DWORD or 3 for QWORD
int getedparity(dynarec_arm_t* dyn, int ninst, uintptr_t addr, uint8_t nextop, int parity, int delta)
{
(void)dyn; (void)ninst;
uint32_t tested = (1<<parity)-1;
if((nextop&0xC0)==0xC0)
return 0; // direct register, no parity...
if(!(nextop&0xC0)) {
if((nextop&7)==4) {
uint8_t sib = F8;
int sib_reg = (sib>>3)&7;
if((sib&0x7)==5) {
uint64_t tmp = F32S64;
if (sib_reg!=4) {
// if XXXXXX+reg<<N then checking the parity of XXXXXX and N should be enough
return ((tmp&tested)==0 && (sib>>6)>=parity)?1:0;
} else {
// just a constant...
return (tmp&tested)?0:1;
}
} else {
if(sib_reg==4 && parity<3)
return 0; // simple [reg]
// don't try [reg1 + reg2<<N], unless reg1 is ESP
return ((sib&0x7)==4 && (sib>>6)>=parity)?1:0;
}
} else if((nextop&7)==5) {
uint64_t tmp = F32S64;
tmp+=addr+delta;
return (tmp&tested)?0:1;
} else {
return 0;
}
} else {
return 0; // Form [reg1 + reg2<<N + XXXXXX]
}
}
#undef F8
#undef F32S64
const char* getCacheName(int t, int n)
{
static char buff[20];
switch(t) {
case NEON_CACHE_ST_D: sprintf(buff, "ST%d", n); break;
case NEON_CACHE_ST_F: sprintf(buff, "st%d", n); break;
case NEON_CACHE_ST_I64: sprintf(buff, "STi%d", n); break;
case NEON_CACHE_MM: sprintf(buff, "MM%d", n); break;
case NEON_CACHE_XMMW: sprintf(buff, "XMM%d", n); break;
case NEON_CACHE_XMMR: sprintf(buff, "xmm%d", n); break;
case NEON_CACHE_YMMW: sprintf(buff, "YMM%d", n); break;
case NEON_CACHE_YMMR: sprintf(buff, "ymm%d", n); break;
case NEON_CACHE_SCR: sprintf(buff, "Scratch"); break;
case NEON_CACHE_NONE: buff[0]='\0'; break;
}
return buff;
}
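// x86-64 register names and the ARM64 registers the dynarec maps them to, used to annotate the disassembly in the GDB JIT output (16-bit and 8-bit forms are annotated with the full 64-bit register)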
static register_mapping_t register_mappings[] = {
{ "rax", "x10" },
{ "eax", "w10" },
{ "ax", "x10" },
{ "ah", "x10" },
{ "al", "x10" },
{ "rcx", "x11" },
{ "ecx", "w11" },
{ "cx", "x11" },
{ "ch", "x11" },
{ "cl", "x11" },
{ "rdx", "x12" },
{ "edx", "w12" },
{ "dx", "x12" },
{ "dh", "x12" },
{ "dl", "x12" },
{ "rbx", "x13" },
{ "ebx", "w13" },
{ "bx", "x13" },
{ "bh", "x13" },
{ "bl", "x13" },
{ "rsi", "x14" },
{ "esi", "w14" },
{ "si", "x14" },
{ "sil", "x14" },
{ "rdi", "x15" },
{ "edi", "w15" },
{ "di", "x15" },
{ "dil", "x15" },
{ "rsp", "x16" },
{ "esp", "w16" },
{ "sp", "x16" },
{ "spl", "x16" },
{ "rbp", "x17" },
{ "ebp", "w17" },
{ "bp", "x17" },
{ "bpl", "x17" },
{ "r8", "x18" },
{ "r8d", "w18" },
{ "r8w", "x18" },
{ "r8b", "x18" },
{ "r9", "x19" },
{ "r9d", "w19" },
{ "r9w", "x19" },
{ "r9b", "x19" },
{ "r10", "x20" },
{ "r10d", "w20" },
{ "r10w", "x20" },
{ "r10b", "x20" },
{ "r11", "x21" },
{ "r11d", "w21" },
{ "r11w", "x21" },
{ "r11b", "x21" },
{ "r12", "x22" },
{ "r12d", "w22" },
{ "r12w", "x22" },
{ "r12b", "x22" },
{ "r13", "x23" },
{ "r13d", "w23" },
{ "r13w", "x23" },
{ "r13b", "x23" },
{ "r14", "x24" },
{ "r14d", "w24" },
{ "r14w", "x24" },
{ "r14b", "x24" },
{ "r15", "x25" },
{ "r15d", "w25" },
{ "r15w", "x25" },
{ "r15b", "x25" },
{ "rip", "x27" },
};
void printf_x64_instruction(dynarec_native_t* dyn, zydis_dec_t* dec, instruction_x64_t* inst, const char* name);
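// Pass-3 debug output for one instruction: build a line describing flags state, native-flags usage, predecessors, jump targets and neoncache contents, then emit it to the dynarec dump, the GDB JIT block and/or the perf map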
void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex)
{
if (!dyn->need_dump && !BOX64ENV(dynarec_gdbjit) && !BOX64ENV(dynarec_perf_map)) return;
static char buf[256];
int length = sprintf(buf, "barrier=%d state=%d/%d/%d(%d:%d->%d:%d), %s=%X/%X, use=%X, need=%X/%X, sm=%d(%d/%d)",
dyn->insts[ninst].x64.barrier,
dyn->insts[ninst].x64.state_flags,
dyn->f.pending,
dyn->f.dfnone,
dyn->insts[ninst].f_entry.pending,
dyn->insts[ninst].f_entry.dfnone,
dyn->insts[ninst].f_exit.pending,
dyn->insts[ninst].f_exit.dfnone,
dyn->insts[ninst].x64.may_set ? "may" : "set",
dyn->insts[ninst].x64.set_flags,
dyn->insts[ninst].x64.gen_flags,
dyn->insts[ninst].x64.use_flags,
dyn->insts[ninst].x64.need_before,
dyn->insts[ninst].x64.need_after,
dyn->smwrite, dyn->insts[ninst].will_write, dyn->insts[ninst].last_write);
if (dyn->insts[ninst].nat_flags_op) {
if (dyn->insts[ninst].nat_flags_op == NAT_FLAG_OP_TOUCH && dyn->insts[ninst].before_nat_flags)
length += sprintf(buf + length, " NF:%d/read:%x", dyn->insts[ninst].nat_flags_op, dyn->insts[ninst].before_nat_flags);
else
length += sprintf(buf + length, " NF:%d", dyn->insts[ninst].nat_flags_op);
}
if (dyn->insts[ninst].use_nat_flags || dyn->insts[ninst].set_nat_flags || dyn->insts[ninst].need_nat_flags) {
length += sprintf(buf + length, " nf:%hhx/%hhx/%hhx", dyn->insts[ninst].set_nat_flags, dyn->insts[ninst].use_nat_flags, dyn->insts[ninst].need_nat_flags);
}
if (dyn->insts[ninst].invert_carry)
length += sprintf(buf + length, " CI");
if (dyn->insts[ninst].gen_inverted_carry)
length += sprintf(buf + length, " gic");
if (dyn->insts[ninst].before_nat_flags & NF_CF) {
length += sprintf(buf + length, " %ccb", dyn->insts[ninst].normal_carry_before ? 'n' : 'i');
}
if (dyn->insts[ninst].need_nat_flags & NF_CF) {
length += sprintf(buf + length, " %cc", dyn->insts[ninst].normal_carry ? 'n' : 'i');
}
if (dyn->insts[ninst].pred_sz) {
length += sprintf(buf + length, ", pred=");
for (int ii = 0; ii < dyn->insts[ninst].pred_sz; ++ii)
length += sprintf(buf + length, "%s%d", ii ? "/" : "", dyn->insts[ninst].pred[ii]);
}
if (!dyn->insts[ninst].x64.alive)
length += sprintf(buf + length, " not executed");
if (dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts >= 0) {
length += sprintf(buf + length, ", jmp=%d", dyn->insts[ninst].x64.jmp_insts);
}
if (dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts == -1)
length += sprintf(buf + length, ", jmp=out");
if (dyn->insts[ninst].x64.has_callret)
length += sprintf(buf + length, ", callret");
if (dyn->last_ip) {
length += sprintf(buf + length, ", last_ip=%p", (void*)dyn->last_ip);
}
for (int ii = 0; ii < 32; ++ii) {
switch (dyn->insts[ninst].n.neoncache[ii].t) {
case NEON_CACHE_ST_D: length += sprintf(buf + length, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_ST_F: length += sprintf(buf + length, " S%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_ST_I64: length += sprintf(buf + length, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_MM: length += sprintf(buf + length, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_XMMW: length += sprintf(buf + length, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_XMMR: length += sprintf(buf + length, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_YMMW: length += sprintf(buf + length, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_YMMR: length += sprintf(buf + length, " Q%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
// case NEON_CACHE_SCR: length += sprintf(buf + length, " D%d:%s", ii, getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n)); break;
case NEON_CACHE_NONE:
default: break;
}
}
if (memcmp(dyn->insts[ninst].n.neoncache, dyn->n.neoncache, sizeof(dyn->n.neoncache))) {
length += sprintf(buf + length, " %s(Change:", (dyn->need_dump > 1) ? "\e[1;91m" : "");
for (int ii = 0; ii < 32; ++ii)
if (dyn->insts[ninst].n.neoncache[ii].v != dyn->n.neoncache[ii].v) {
length += sprintf(buf + length, " V%d:%s", ii, getCacheName(dyn->n.neoncache[ii].t, dyn->n.neoncache[ii].n));
length += sprintf(buf + length, "->%s", getCacheName(dyn->insts[ninst].n.neoncache[ii].t, dyn->insts[ninst].n.neoncache[ii].n));
}
length += sprintf(buf + length, ")%s", (dyn->need_dump > 1) ? "\e[0;32m" : "");
}
if (dyn->insts[ninst].n.ymm_used) {
length += sprintf(buf + length, " ymmUsed=%04x", dyn->insts[ninst].n.ymm_used);
}
if (dyn->ymm_zero || dyn->insts[ninst].ymm0_add || dyn->insts[ninst].ymm0_sub || dyn->insts[ninst].ymm0_out) {
length += sprintf(buf + length, " ymm0=(%04x/%04x+%04x-%04x=%04x)", dyn->ymm_zero, dyn->insts[ninst].ymm0_in, dyn->insts[ninst].ymm0_add, dyn->insts[ninst].ymm0_sub, dyn->insts[ninst].ymm0_out);
}
if (dyn->insts[ninst].purge_ymm) {
length += sprintf(buf + length, " purgeYmm=%04x", dyn->insts[ninst].purge_ymm);
}
if (dyn->n.stack || dyn->insts[ninst].n.stack_next || dyn->insts[ninst].n.x87stack) {
length += sprintf(buf + length, " X87:%d/%d(+%d/-%d)%d", dyn->n.stack, dyn->insts[ninst].n.stack_next, dyn->insts[ninst].n.stack_push, dyn->insts[ninst].n.stack_pop, dyn->insts[ninst].n.x87stack);
}
if (dyn->insts[ninst].n.combined1 || dyn->insts[ninst].n.combined2) {
length += sprintf(buf + length, " %s:%d/%d", dyn->insts[ninst].n.swapped ? "SWP" : "CMB", dyn->insts[ninst].n.combined1, dyn->insts[ninst].n.combined2);
}
if (dyn->need_dump) {
printf_x64_instruction(dyn, rex.is32bits ? my_context->dec32 : my_context->dec, &dyn->insts[ninst].x64, name);
dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, %s%s\n",
(dyn->need_dump > 1) ? "\e[32m" : "",
(void*)(dyn->native_start + dyn->insts[ninst].address), dyn->insts[ninst].size / 4, ninst, buf, (dyn->need_dump > 1) ? "\e[m" : "");
}
if (BOX64ENV(dynarec_gdbjit)) {
static char buf2[512];
if (BOX64ENV(dynarec_gdbjit) > 1) {
sprintf(buf2, "; %d: %d opcodes, %s", ninst, dyn->insts[ninst].size / 4, buf);
dyn->gdbjit_block = GdbJITBlockAddLine(dyn->gdbjit_block, (dyn->native_start + dyn->insts[ninst].address), buf2);
}
zydis_dec_t* dec = rex.is32bits ? my_context->dec32 : my_context->dec;
const char* inst_name = name;
if (dec) {
inst_name = DecodeX64Trace(dec, dyn->insts[ninst].x64.addr, 0);
x64disas_add_register_mapping_annotations(buf2, inst_name, register_mappings, sizeof(register_mappings) / sizeof(register_mappings[0]));
inst_name = buf2;
}
dyn->gdbjit_block = GdbJITBlockAddLine(dyn->gdbjit_block, (dyn->native_start + dyn->insts[ninst].address), inst_name);
}
if (BOX64ENV(dynarec_perf_map) && BOX64ENV(dynarec_perf_map_fd) != -1) {
writePerfMap(dyn->insts[ninst].x64.addr, dyn->native_start + dyn->insts[ninst].address, dyn->insts[ninst].size / 4, name);
}
}
void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode)
{
dynarec_log_prefix(0, LOG_NONE, "\t%08x\t%s\n", opcode, arm64_print(opcode, (uintptr_t)dyn->block));
}
static void x87_reset(neoncache_t* n)
{
for (int i=0; i<8; ++i)
n->x87cache[i] = -1;
n->tags = 0;
n->x87stack = 0;
n->stack = 0;
n->stack_next = 0;
n->stack_pop = 0;
n->stack_push = 0;
n->combined1 = n->combined2 = 0;
n->swapped = 0;
n->barrier = 0;
n->pushed = 0;
n->poped = 0;
for(int i=0; i<24; ++i)
if(n->neoncache[i].t == NEON_CACHE_ST_F
|| n->neoncache[i].t == NEON_CACHE_ST_D
|| n->neoncache[i].t == NEON_CACHE_ST_I64)
n->neoncache[i].v = 0;
}
static void mmx_reset(neoncache_t* n)
{
n->mmxcount = 0;
for (int i=0; i<8; ++i)
n->mmxcache[i] = -1;
}
static void sse_reset(neoncache_t* n)
{
for (int i=0; i<16; ++i)
n->ssecache[i].v = -1;
for (int i=0; i<32; ++i)
if(n->neoncache[i].t==NEON_CACHE_YMMR || n->neoncache[i].t==NEON_CACHE_YMMW)
n->neoncache[i].v = 0;
}
void fpu_reset(dynarec_native_t* dyn)
{
x87_reset(&dyn->n);
mmx_reset(&dyn->n);
sse_reset(&dyn->n);
fpu_reset_reg(dyn);
dyn->ymm_zero = 0;
}
void fpu_reset_ninst(dynarec_native_t* dyn, int ninst)
{
x87_reset(&dyn->insts[ninst].n);
mmx_reset(&dyn->insts[ninst].n);
sse_reset(&dyn->insts[ninst].n);
fpu_reset_reg_neoncache(&dyn->insts[ninst].n);
}
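// Return 1 if the x87 tag bits for ST(st) are non-zero (register marked freed/empty)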
int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st)
{
return (dyn->n.tags&(0b11<<(st*2)))?1:0;
}
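// Record native-flag usage for instruction ninst: flags are marked as generated if the instruction sets x86 flags (and 'before' is 0), otherwise as consumed, before or during the instruction; returns the flag mask unchanged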
uint8_t mark_natflag(dynarec_arm_t* dyn, int ninst, uint8_t flag, int before)
{
if(dyn->insts[ninst].x64.set_flags && !before) {
dyn->insts[ninst].set_nat_flags |= flag;
//if(dyn->insts[ninst].x64.use_flags) {
// dyn->insts[ninst].use_nat_flags |= flag;
//}
} else {
if(before)
dyn->insts[ninst].use_nat_flags_before |= flag;
else
dyn->insts[ninst].use_nat_flags |= flag;
}
return flag;
}
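// Translate x86 flag bits to the corresponding ARM64 native flags (X_ZF->NF_EQ, X_SF->NF_SF, X_OF->NF_VF, X_CF->NF_CF); AF and PF have no native equivalent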
uint8_t flag2native(uint8_t flags)
{
uint8_t ret = 0;
#ifdef ARM64
if(flags&X_ZF) ret|=NF_EQ;
if(flags&X_SF) ret|=NF_SF;
if(flags&X_OF) ret|=NF_VF;
if(flags&X_CF) ret|=NF_CF;
#else
// no native flags on rv64 or la64
#endif
return ret;
}
int flagIsNative(uint8_t flags)
{
if(flags&(X_AF|X_PF)) return 0;
return 1;
}
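// Starting at the flag-generating instruction 'start', walk forward (following jumps whose target has a single predecessor) and compute which of the natively generated 'flags' are actually consumed before being clobbered or regenerated; returns the used subset, or 0 if native flags cannot be carried through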
static uint8_t getNativeFlagsUsed(dynarec_arm_t* dyn, int start, uint8_t flags)
{
// propagate and check which flags are actually used
uint8_t used_flags = 0;
int nat_flags_used = 0;
int ninst = start;
while(ninst<dyn->size) {
//printf_log(LOG_INFO, "getNativeFlagsUsed ninst:%d/%d, flags=%x, used_flags=%x(%d), nat_flags_op_before:%x, nat_flags_op:%x, need_after:%x set_nat_flags:%x nat_flags_used:%x(%x)\n", ninst, start, flags, used_flags, nat_flags_used, dyn->insts[ninst].nat_flags_op_before, dyn->insts[ninst].nat_flags_op, flag2native(dyn->insts[ninst].x64.need_after),dyn->insts[ninst].set_nat_flags, dyn->insts[ninst].use_nat_flags, dyn->insts[ninst].use_nat_flags_before);
// check if this is an opcode that generate flags but consume flags before
if(dyn->insts[ninst].nat_flags_op_before)
return 0;
// check if nat flags are used "before"
if(dyn->insts[ninst].use_nat_flags_before) {
// check if the gen flags are compatible
if(dyn->insts[ninst].use_nat_flags_before&~flags)
return 0;
nat_flags_used = 1;
used_flags|=dyn->insts[ninst].use_nat_flags_before&flags;
}
// if the opcode generate flags, return
if(dyn->insts[ninst].nat_flags_op==NAT_FLAG_OP_TOUCH && (start!=ninst)) {
if(!nat_flags_used)
return 0;
if(used_flags&~dyn->insts[ninst].set_nat_flags) {
// check partial changes that would destroy flag state
if(dyn->insts[ninst].use_nat_flags_before&flags)
return used_flags;
// check if flags are all refreshed, then it's ok
if((used_flags&dyn->insts[ninst].set_nat_flags)==used_flags)
return used_flags;
// incompatible
return 0;
}
return used_flags;
}
// check if there is a callret barrier
if(dyn->insts[ninst].x64.has_callret)
return 0;
// check if nat flags are used
if(dyn->insts[ninst].use_nat_flags) {
// check if the gen flags are compatible
if(dyn->insts[ninst].use_nat_flags&~flags)
return 0;
nat_flags_used = 1;
used_flags |= dyn->insts[ninst].use_nat_flags&flags;
}
if(ninst!=start && dyn->insts[ninst].x64.use_flags) {
// some flags not compatible with native, partial use not allowed
if(flag2native(dyn->insts[ninst].x64.use_flags)!=dyn->insts[ninst].use_nat_flags)
return 0;
// check if flags are used, but not the natives ones
//if(dyn->insts[ninst].use_nat_flags&~used_flags)
// return 0;
}
// check if flags are generated without native option
if((start!=ninst) && dyn->insts[ninst].x64.gen_flags && (flag2native(dyn->insts[ninst].x64.gen_flags&dyn->insts[ninst].x64.need_after)&used_flags)) {
if(used_flags&~flag2native(dyn->insts[ninst].x64.gen_flags&dyn->insts[ninst].x64.need_after))
return 0; // partial coverage, not supported for now (TODO: this might be fixable)
else
return nat_flags_used?used_flags:0; // full coverage... End of propagation
}
// check if flags are still needed
if(!(flag2native(dyn->insts[ninst].x64.need_after)&flags))
return nat_flags_used?used_flags:0;
// check if flags are destroyed, cancel the use then
if(dyn->insts[ninst].nat_flags_op && (start!=ninst))
return 0;
// update used flags
//used_flags |= (flag2native(dyn->insts[ninst].x64.need_after)&flags);
// go next
if(!dyn->insts[ninst].x64.has_next) {
// check if it's a jump to an opcode with only 1 pred, then just follow the jump
int jmp = dyn->insts[ninst].x64.jmp_insts;
if(dyn->insts[ninst].x64.jmp && (jmp!=-1) && (getNominalPred(dyn, jmp)==ninst))
ninst = jmp;
else
return nat_flags_used?used_flags:0;
} else
++ninst;
}
return nat_flags_used?used_flags:0;
}
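// For an instruction that generates flags natively, find which native flags are really used downstream and mark every instruction on the path with need_nat_flags (tracking normal vs inverted carry), until the flags are regenerated or no longer needed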
static void propagateNativeFlags(dynarec_arm_t* dyn, int start)
{
int ninst = start;
// those are the flags generated by the opcode and used later on
uint8_t flags = dyn->insts[ninst].set_nat_flags&flag2native(dyn->insts[ninst].x64.need_after);
// check if they are actually used before starting
//printf_log(LOG_INFO, "propagateNativeFlags called for start=%d, flags=%x, will need:%x\n", start, flags, flag2native(dyn->insts[ninst].x64.need_after));
if(!flags) return;
// also check if some native flags are used but not generated here
if(flag2native(dyn->insts[ninst].x64.use_flags)&~flags) return;
uint8_t used_flags = getNativeFlagsUsed(dyn, start, flags);
//printf_log(LOG_INFO, " will use:%x, carry:%d, generate inverted carry:%d\n", used_flags, used_flags&NF_CF, dyn->insts[ninst].gen_inverted_carry);
if(!used_flags) return; // the flags wont be used, so just cancel
int nc = dyn->insts[ninst].gen_inverted_carry?0:1;
int carry = used_flags&NF_CF;
// propagate
while(ninst<dyn->size) {
// check if this is an opcode that generate flags but consume flags before
if((start!=ninst) && dyn->insts[ninst].nat_flags_op==NAT_FLAG_OP_TOUCH) {
if(dyn->insts[ninst].use_nat_flags_before) {
dyn->insts[ninst].before_nat_flags |= used_flags;
if(carry) dyn->insts[ninst].normal_carry_before = nc;
}
// if the opcode generate flags, return
return;
}
// check if flags are generated without native option
if((start!=ninst) && dyn->insts[ninst].x64.gen_flags && (flag2native(dyn->insts[ninst].x64.gen_flags&dyn->insts[ninst].x64.need_after)&used_flags))
return;
// mark the opcode
uint8_t use_flags = flag2native(dyn->insts[ninst].x64.need_before|dyn->insts[ninst].x64.need_after);
if(dyn->insts[ninst].x64.use_flags) use_flags |= flag2native(dyn->insts[ninst].x64.use_flags); // should not change anything
//printf_log(LOG_INFO, " marking ninst=%d with %x | %x&%x => %x\n", ninst, dyn->insts[ninst].need_nat_flags, used_flags, use_flags, dyn->insts[ninst].need_nat_flags | (used_flags&use_flags));
dyn->insts[ninst].need_nat_flags |= used_flags&use_flags;
if(carry) dyn->insts[ninst].normal_carry = nc;
if(carry && dyn->insts[ninst].invert_carry) nc = 0;
// check if flags are still needed
if(!(flag2native(dyn->insts[ninst].x64.need_after)&used_flags))
return;
// go next
if(!dyn->insts[ninst].x64.has_next) {
// check if it's a jump to an opcode with only 1 pred, then just follow the jump
int jmp = dyn->insts[ninst].x64.jmp_insts;
if(dyn->insts[ninst].x64.jmp && (jmp!=-1) && (getNominalPred(dyn, jmp)==ninst))
ninst = jmp;
else
return;
} else
++ninst;
}
}
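// Native-flags analysis pass: propagate native-flag usage forward from every instruction that touches the host flags and generates x86 flags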
void updateNativeFlags(dynarec_native_t* dyn)
{
if(!BOX64ENV(dynarec_nativeflags))
return;
// forward check if native flags are used
for(int ninst=0; ninst<dyn->size; ++ninst)
if(flag2native(dyn->insts[ninst].x64.gen_flags) && (dyn->insts[ninst].nat_flags_op==NAT_FLAG_OP_TOUCH)) {
propagateNativeFlags(dyn, ninst);
}
}
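// Reset the native-flags bookkeeping recorded for instruction ninst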
void rasNativeState(dynarec_arm_t* dyn, int ninst)
{
dyn->insts[ninst].nat_flags_op = dyn->insts[ninst].set_nat_flags = dyn->insts[ninst].use_nat_flags = dyn->insts[ninst].need_nat_flags = 0;
}
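// Return 1 if the native flags held at the end of ninst do not match what its jump target expects: missing flags, flags only available as x86 flags at the target, or a carry-polarity (normal vs inverted) mismatch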
int nativeFlagsNeedsTransform(dynarec_arm_t* dyn, int ninst)
{
int jmp = dyn->insts[ninst].x64.jmp_insts;
if(jmp<0)
return 0;
if(!dyn->insts[ninst].x64.need_after || !dyn->insts[jmp].x64.need_before)
return 0;
if(dyn->insts[ninst].set_nat_flags)
return 0;
uint8_t flags_before = dyn->insts[ninst].need_nat_flags;
uint8_t nc_before = dyn->insts[ninst].normal_carry;
if(dyn->insts[ninst].invert_carry)
nc_before = 0;
uint8_t flags_after = dyn->insts[jmp].need_nat_flags;
uint8_t nc_after = dyn->insts[jmp].normal_carry;
if(dyn->insts[jmp].nat_flags_op==NAT_FLAG_OP_TOUCH) {
flags_after = dyn->insts[jmp].before_nat_flags;
nc_after = dyn->insts[jmp].normal_carry_before;
}
uint8_t flags_x86 = flag2native(dyn->insts[jmp].x64.need_before);
flags_x86 &= ~flags_after;
if((flags_before&NF_CF) && (flags_after&NF_CF) && (nc_before!=nc_after))
return 1;
// all of flags_after must be present, and none of the remaining flags_x86
if(((flags_before&flags_after)!=flags_after) || (flags_before&flags_x86))
return 1;
return 0;
}
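// Save a copy of the cache state of instruction ninst and unwind the live one back to its entry state; fpu_unwind_restore puts the saved copy back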
void fpu_save_and_unwind(dynarec_arm_t* dyn, int ninst, neoncache_t* cache)
{
memcpy(cache, &dyn->insts[ninst].n, sizeof(neoncache_t));
neoncacheUnwind(&dyn->insts[ninst].n);
}
void fpu_unwind_restore(dynarec_arm_t* dyn, int ninst, neoncache_t* cache)
{
memcpy(&dyn->insts[ninst].n, cache, sizeof(neoncache_t));
}