Skip to content

Commit

Permalink
[ARM64_DYNAREC] Fixed/Improved D9 E5 opcode and add hack for FLD1/FLD…
Browse files Browse the repository at this point in the history
…1/FPATAN sequence
  • Loading branch information
ptitSeb committed Jan 27, 2024
1 parent 28ef6cd commit 124672b
Showing 1 changed file with 85 additions and 6 deletions.
91 changes: 85 additions & 6 deletions src/dynarec/arm64/dynarec_arm64_d9.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
uint8_t ed;
uint8_t wback, wb1;
uint8_t u8;
int64_t fixedaddress;
int64_t fixedaddress, j64;
int unscaled;
int v1, v2;
int s0;
Expand Down Expand Up @@ -131,18 +131,97 @@ uintptr_t dynarec64_D9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nin
break;
case 0xE5:
INST_NAME("FXAM");
#if 1
i1 = x87_get_current_cache(dyn, ninst, 0, NEON_CACHE_ST_D);
// value put in x2
if(i1==-1) {
// not in cache, so check Empty status and load it
i2 = -dyn->n.x87stack;
LDRw_U12(x3, xEmu, offsetof(x64emu_t, fpu_stack));
if(i2) {
if(i2<0) {
ADDw_U12(x3, x3, -i2);
} else {
SUBw_U12(x3, x3, i2);
}
}
CMPSw_U12(x3, 0);
MOV32w(x3, 0b100000100000000);
CSELx(x4, x3, x4, cLE); // empty: C3,C2,C0 = 101
B_MARK3(cLE);
// x4 will be the actual top
LDRw_U12(x4, xEmu, offsetof(x64emu_t, top));
if(i2) {
if(i2<0) {
SUBw_U12(x4, x4, -i2);
} else {
ADDw_U12(x4, x4, i2);
}
ANDw_mask(x4, x4, 0, 3); // (emu->top + i)&7
}
ADDx_REG_LSL(x1, xEmu, x4, 3);
LDRx_U12(x2, x1, offsetof(x64emu_t, x87)); // load x2 with ST0 anyway, for sign extraction
} else {
// simply move from cache reg to x2
v1 = dyn->n.x87reg[i1];
VMOVQDto(x2, v1, 0);
}
// get exponent in x1
LSRx_IMM(x1, x2, 20+32);
ANDSx_mask(x1, x1, 1, 0b00000, 0b001010); // 0x7ff
B_MARK(cNE); // not zero or denormal
ANDx_mask(x1, x3, 1, 0, 0b111110); // 0x7fffffffffffffff
ORRx_REG(x1, x1, x2);
MOV32w(x4, 0b100000000000000); // Zero: C3,C2,C0 = 100
MOV32w(x5, 0b100010000000000); // Denormal: C3,C2,C0 = 110
CSELx(x4, x4, x5, cEQ);
B_MARK3(c__);
MARK;
ORRx_mask(x4, xZR, 1, 0b001100, 0b001010); // 0x7ff0000000000000
CMPSx_REG(x2, x4); // infinite/NaN?
MOV32w(x5, 0b000010000000000); // normal: C3,C2,C0 = 010
CSELx(x4, x5, x4, cNE);
B_MARK3(cNE);
TSTx_mask(x2, 1, 0b000000, 0b110011); // 0x000fffffffffffff
MOV32w(x4, 0b000010100000000); // infinity: C3,C2,C0 = 011
MOV32w(x3, 0b000000100000000); // NaN: C3,C2,C0 = 001
CSELx(x4, x4, x3, cEQ);
MARK3;
// Extract sign & update SW
LSRx_IMM(x1, x2, 63);
BFIx(x4, x1, 9, 1); //C1
LDRH_U12(x1, xEmu, offsetof(x64emu_t, sw));
MOV32w(x2, 0b01000111);
BICw_REG_LSL(x1, x1, x2, 8);
ORRw_REG(x4, x4, x1);
STRH_U12(x4, xEmu, offsetof(x64emu_t, sw));
#else
MESSAGE(LOG_DUMP, "Need Optimization\n");
x87_refresh(dyn, ninst, x1, x2, 0);
i1 = x87_stackcount(dyn, ninst, x1);
x87_forget(dyn, ninst, x1, x2, 0);
//x87_refresh(dyn, ninst, x1, x2, 0);
CALL(fpu_fxam, -1); // should be possible inline, but is it worth it?
x87_unstackcount(dyn, ninst, x1, i1);
#endif
break;

case 0xE8:
INST_NAME("FLD1");
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_F);
if(ST_IS_F(0)) {
FMOVS_8(v1, 0b01110000);
if(ninst<dyn->size+2 && (dyn->insts[ninst+1].pred_sz==1) && (dyn->insts[ninst+2].pred_sz==1)
&& PK(0)==0xD9 && PK(1)==0xE8
&& PK(2)==0xD9 && PK(3)==0xF3
) {
MESSAGE(LOG_DUMP, "Hack for FLD1 FLD1 FPATAN");
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_F);
FTABLE64(v1, PI/4.0);
addr+=4;
} else {
FMOVD_8(v1, 0b01110000);
X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_F);
if(ST_IS_F(0)) {
FMOVS_8(v1, 0b01110000);
} else {
FMOVD_8(v1, 0b01110000);
}
}
break;
case 0xE9:
Expand Down

0 comments on commit 124672b

Please sign in to comment.