Update fmod algorithm
This patch updates the fmod algorithm to the version described in the paper.
shibatch authored Sep 30, 2019
1 parent 50fb6a4 commit 8b77cc4
Showing 4 changed files with 48 additions and 33 deletions.
27 changes: 15 additions & 12 deletions src/libm/sleefdp.c
@@ -2408,22 +2408,25 @@ static INLINE CONST double ptrunc(double x) {
 }
 
 EXPORT CONST double xfmod(double x, double y) {
-  double nu = fabsk(x), de = fabsk(y), s = 1, q;
-  if (de < DBL_MIN) { nu *= 1ULL << 54; de *= 1ULL << 54; s = 1.0 / (1ULL << 54); }
-  Sleef_double2 r = dd(nu, 0);
-  double rde = toward0(1.0 / de);
-
-  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 51) + 1
-    q = (de+de > r.x && r.x >= de) ? 1 : (toward0(r.x) * rde);
-    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(removelsb(ptrunc(q)), -de)));
-    if (r.x < de) break;
+  double n = fabsk(x), d = fabsk(y), s = 1, q;
+  if (d < DBL_MIN) { n *= 1ULL << 54; d *= 1ULL << 54; s = 1.0 / (1ULL << 54); }
+  Sleef_double2 r = dd(n, 0);
+  double rd = toward0(1.0 / d);
+
+  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)
+    q = removelsb(ptrunc(toward0(r.x) * rd));
+    q = (3*d > r.x && r.x > d) ? 2 : q;
+    q = (2*d > r.x && r.x > d) ? 1 : q;
+    q = r.x == d ? (r.y >= 0 ? 1 : 0) : q;
+    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));
+    if (r.x < d) break;
   }
 
   double ret = r.x * s;
-  if (r.x + r.y == de) ret = 0;
+  if (r.x + r.y == d) ret = 0;
   ret = mulsign(ret, x);
-  if (nu < de) ret = x;
-  if (de == 0) ret = SLEEF_NAN;
+  if (n < d) ret = x;
+  if (d == 0) ret = SLEEF_NAN;
 
   return ret;
 }
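For readers following the diff above without the paper at hand, here is a minimal scalar sketch of the new loop structure in plain C. It is an illustration only, not the SLEEF implementation: the remainder r is kept as an ordinary double instead of a Sleef_double2, and toward0()/ptrunc()/removelsb() are approximated with the standard nextafter() and trunc(), so the sketch is not exact the way xfmod is. What it mirrors is the new control flow: the quotient estimate is truncated up front and then clamped to 2 or 1 near the boundary, replacing the old single de+de test.

  #include <math.h>
  #include <float.h>

  // Simplified sketch of the updated loop; NOT the actual SLEEF code.
  static double fmod_sketch(double x, double y) {
    double n = fabs(x), d = fabs(y), s = 1, q;
    if (d < DBL_MIN) { n *= 1ULL << 54; d *= 1ULL << 54; s = 1.0 / (1ULL << 54); }
    double r = n;                       // xfmod keeps this as a double-double
    double rd = nextafter(1.0 / d, 0);  // reciprocal rounded toward zero

    for (int i = 0; i < 21; i++) {
      q = trunc(nextafter(r, 0) * rd);  // quotient estimate, never above r/d
      if (3*d > r && r > d) q = 2;      // clamp the estimate near the boundary,
      if (2*d > r && r > d) q = 1;      // where the subtraction below is exact
      if (r == d) q = 1;                // xfmod also consults the low word r.y here
      r = r - q * d;                    // done in double-double in xfmod
      if (r < d) break;
    }

    double ret = r * s;
    ret = copysign(ret, x);
    if (n < d) ret = x;
    if (d == 0) ret = NAN;
    return ret;
  }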
40 changes: 23 additions & 17 deletions src/libm/sleefsimddp.c
@@ -3298,30 +3298,36 @@ static INLINE CONST VECTOR_CC vdouble vptrunc(vdouble x) { // round to integer t

 /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
 EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) {
-  vdouble nu = vabs_vd_vd(x), de = vabs_vd_vd(y), s = vcast_vd_d(1), q;
-  vopmask o = vlt_vo_vd_vd(de, vcast_vd_d(DBL_MIN));
-  nu = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(nu, vcast_vd_d(1ULL << 54)), nu);
-  de = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(de, vcast_vd_d(1ULL << 54)), de);
+  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;
+  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
+  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(1ULL << 54)), n);
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1ULL << 54)), d);
   s = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (1ULL << 54))), s);
-  vdouble rde = vtoward0(vrec_vd_vd(de));
-  vdouble2 r = vcast_vd2_vd_vd(nu, vcast_vd_d(0));
-
-  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 51) + 1
-    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(de, de), r.x),
-                                       vge_vo_vd_vd(r.x, de)),
-                         vcast_vd_d(1), vmul_vd_vd_vd(vtoward0(r.x), rde));
-    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(vptrunc(q)), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
-    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(de))));
-    if (vtestallones_i_vo64(vlt_vo_vd_vd(r.x, de))) break;
+  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));
+  vdouble rd = vtoward0(vrec_vd_vd(d));
+
+  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)
+    q = vptrunc(vmul_vd_vd_vd(vtoward0(r.x), rd));
+#ifndef ENABLE_FMA_DP
+    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
+#endif
+    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vmul_vd_vd_vd(vcast_vd_d(3), d), r.x),
+                                       vge_vo_vd_vd(r.x, d)),
+                         vcast_vd_d(2), q);
+    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(d, d), r.x),
+                                       vge_vo_vd_vd(r.x, d)),
+                         vcast_vd_d(1), q);
+    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));
+    if (vtestallones_i_vo64(vlt_vo_vd_vd(r.x, d))) break;
   }
 
   vdouble ret = vmul_vd_vd_vd(r.x, s);
-  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(r.x, r.y), de), vcast_vd_d(0), ret);
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(r.x, r.y), d), vcast_vd_d(0), ret);
 
   ret = vmulsign_vd_vd_vd(ret, x);
 
-  ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(nu, de), x, ret);
-  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(de, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);
+  ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(n, d), x, ret);
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);
 
   return ret;
 }
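One detail worth noting in the vector version above: the step that clears the least significant bit of the quotient (the vcast_vm_i_i(0xffffffff, 0xfffffffe) mask) is now applied only when ENABLE_FMA_DP is not defined, mirroring the scalar code's removelsb(). A scalar sketch of what that masking does is below; removelsb_sketch is a hypothetical name used for illustration, and the motivation (keeping q short enough that the non-FMA double-double multiplication by -d stays exact) is my reading of the code, not something stated in the patch.

  #include <stdint.h>
  #include <string.h>

  // Hypothetical scalar counterpart of the vector masking step:
  // clear the least significant bit of a double's significand.
  static double removelsb_sketch(double x) {
    uint64_t u;
    memcpy(&u, &x, sizeof u);            // reinterpret the bits, like vreinterpret_vm_vd
    u &= UINT64_C(0xfffffffffffffffe);   // same mask as vcast_vm_i_i(0xffffffff, 0xfffffffe)
    memcpy(&x, &u, sizeof x);
    return x;
  }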
8 changes: 6 additions & 2 deletions src/libm/sleefsimdsp.c
@@ -2902,9 +2902,13 @@ EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) {
   vfloat2 r = vcast_vf2_vf_vf(nu, vcast_vf_f(0));
 
   for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
-    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vadd_vf_vf_vf(de, de), r.x),
+    q = vptruncf(vmul_vf_vf_vf(vtoward0f(r.x), rde));
+    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(3), de), r.x),
                                        vge_vo_vf_vf(r.x, de)),
-                         vcast_vf_f(1), vmul_vf_vf_vf(vtoward0f(r.x), rde));
+                         vcast_vf_f(2), q);
+    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2), de), r.x),
+                                       vge_vo_vf_vf(r.x, de)),
+                         vcast_vf_f(1), q);
     r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(vptruncf(q), vneg_vf_vf(de))));
     if (vtestallones_i_vo32(vlt_vo_vf_vf(r.x, de))) break;
   }
6 changes: 4 additions & 2 deletions src/libm/sleefsp.c
@@ -2020,8 +2020,10 @@ EXPORT CONST float xfmodf(float x, float y) {
   float rde = toward0f(1.0f / de);
 
   for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
-    q = (de+de > r.x && r.x >= de) ? 1.0f : (toward0f(r.x) * rde);
-    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(ptruncf(q), -de)));
+    q = ptruncf(toward0f(r.x) * rde);
+    q = (3*de > r.x && r.x >= de) ? 2 : q;
+    q = (2*de > r.x && r.x >= de) ? 1 : q;
+    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -de)));
     if (r.x < de) break;
   }
 
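After rebuilding with this patch, a quick spot check against the C library is an easy sanity test. The snippet below is hypothetical and not part of the patch; it assumes the scalar double entry point is exported as Sleef_fmod in sleef.h (adjust the name to whatever your build installs) and that the program is linked against libsleef and libm.

  #include <stdio.h>
  #include <math.h>
  #include <sleef.h>

  int main(void) {
    double xs[] = { 5.5, 1e308, 3.0, -7.25 };
    double ys[] = { 2.0, 1e-308, 3.0, 2.5 };
    for (int i = 0; i < 4; i++)
      printf("fmod(%g, %g): sleef = %.17g, libm = %.17g\n",
             xs[i], ys[i], Sleef_fmod(xs[i], ys[i]), fmod(xs[i], ys[i]));
    return 0;
  }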
