Voodoo: Fix level of detail calculations with perspective correction enabled
bochs-emu · Jan 2, 2025 · e7f2a4a · e7f2a4a
1 parent a8e83f2
commit e7f2a4a
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 120 deletions.
diff --git a/bochs/iodev/display/voodoo_data.h b/bochs/iodev/display/voodoo_data.h
@@ -106,17 +106,10 @@ enum
 #define REG_WPF         (REGISTER_WRITE | REGISTER_PIPELINED | REGISTER_FIFO)
 #define REG_RWPF        (REGISTER_READ | REGISTER_WRITE | REGISTER_PIPELINED | REGISTER_FIFO)
 
-/* lookup bits is the log2 of the size of the reciprocal/log table */
-#define RECIPLOG_LOOKUP_BITS  9
-
-/* input precision is how many fraction bits the input value has; this is a 64-bit number */
-#define RECIPLOG_INPUT_PREC   32
-
-/* lookup precision is how many fraction bits each table entry contains */
-#define RECIPLOG_LOOKUP_PREC  22
+/* lookup bits is the log2 of the size of the log table */
+#define LOG_LOOKUP_BITS     7
 
 /* output precision is how many fraction bits the result should have */
-#define RECIP_OUTPUT_PREC   15
 #define LOG_OUTPUT_PREC     8
 
 
@@ -2036,93 +2029,36 @@ BX_CPP_INLINE Bit8u _count_leading_zeros(Bit32u value)
 }
 #endif
 
-/*************************************
- *
- *  Computes a fast 16.16 reciprocal
- *  of a 16.32 value; used for
- *  computing 1/w in the rasterizer.
- *
- *  Since it is trivial to also
- *  compute log2(1/w) = -log2(w) at
- *  the same time, we do that as well
- *  to 16.8 precision for LOD
- *  calculations.
- *
- *  On a Pentium M, this routine is
- *  20% faster than a 64-bit integer
- *  divide and also produces the log
- *  for free.
- *
- *************************************/
+//-------------------------------------------------
+//  fast_log2 - computes the log2 of a double-
+//  precision value as a 24.8 value; if the double
+//  was converted from a fixed-point integer, the
+//  number of fractional bits should be specified
+//  by fracbits
+//-------------------------------------------------
 
-BX_CPP_INLINE Bit32s fast_reciplog(Bit64s value, Bit32s *log2)
+BX_CPP_INLINE Bit32s fast_log2(double value, int fracbits = 0)
 {
-  extern Bit32u voodoo_reciplog[];
-  Bit32u temp, recip, rlog;
-  Bit32u interp;
-  Bit32u *table;
-  int neg = false;
-  int lz, exp = 0;
-
-  /* always work with unsigned numbers */
-  if (value < 0)
-  {
-    value = -value;
-    neg = true;
-  }
-
-  /* if we've spilled out of 32 bits, push it down under 32 */
-  if (value & BX_CONST64(0xffff00000000))
-  {
-    temp = (Bit32u)(value >> 16);
-    exp -= 16;
-  }
-  else
-    temp = (Bit32u)value;
-
-  /* if the resulting value is 0, the reciprocal is infinite */
-  if (temp == 0)
-  {
-    *log2 = 1000 << LOG_OUTPUT_PREC;
-    return neg ? 0x80000000 : 0x7fffffff;
-  }
+  extern Bit8u voodoo_log[];
 
-  /* determine how many leading zeros in the value and shift it up high */
-  lz = count_leading_zeros(temp);
-  temp <<= lz;
-  exp += lz;
+  // negative values return 0
+  if (unlikely(value < 0))
+    return 0;
 
-  /* compute a pointer to the table entries we want */
-  /* math is a bit funny here because we shift one less than we need to in order */
-  /* to account for the fact that there are two Bit32u's per table entry */
-  table = &voodoo_reciplog[(temp >> (31 - RECIPLOG_LOOKUP_BITS - 1)) & ((2 << RECIPLOG_LOOKUP_BITS) - 2)];
+  // convert the value to a raw integer
+  union { double d; Bit64u i; } temp;
+  temp.d = value;
 
-  /* compute the interpolation value */
-  interp = (temp >> (31 - RECIPLOG_LOOKUP_BITS - 8)) & 0xff;
+  // we only care about the 11-bit exponent and top 7 bits of mantissa
+  // (sign is already assured to be 0)
+  Bit32u ival = temp.i >> 45;
 
-  /* do a linear interpolatation between the two nearest table values */
-  /* for both the log and the reciprocal */
-  rlog = (table[1] * (0x100 - interp) + table[3] * interp) >> 8;
-  recip = (table[0] * (0x100 - interp) + table[2] * interp) >> 8;
-
-  /* the log result is the fractional part of the log; round it to the output precision */
-  rlog = (rlog + (1 << (RECIPLOG_LOOKUP_PREC - LOG_OUTPUT_PREC - 1))) >> (RECIPLOG_LOOKUP_PREC - LOG_OUTPUT_PREC);
-
-  /* the exponent is the non-fractional part of the log; normally, we would subtract it from rlog */
-  /* but since we want the log(1/value) = -log(value), we subtract rlog from the exponent */
-  *log2 = ((exp - (31 - RECIPLOG_INPUT_PREC)) << LOG_OUTPUT_PREC) - rlog;
-
-  /* adjust the exponent to account for all the reciprocal-related parameters to arrive at a final shift amount */
-  exp += (RECIP_OUTPUT_PREC - RECIPLOG_LOOKUP_PREC) - (31 - RECIPLOG_INPUT_PREC);
-
-  /* shift by the exponent */
-  if (exp < 0)
-    recip >>= -exp;
-  else
-    recip <<= exp;
+  // extract exponent, unbias, and adjust for fixed-point fraction
+  Bit32s exp = (ival >> 7) - 1023 - fracbits;
 
-  /* on the way out, apply the original sign to the reciprocal */
-  return neg ? -((Bit32s)recip) : recip;
+  // use top 7 bits of mantissa to look up fractional log
+  // combine the integral and fractional parts
+  return (exp << 8) | voodoo_log[ival & 127];
 }
 
 
@@ -2876,18 +2812,29 @@ do
 {                                                                                \
   Bit32s blendr, blendg, blendb, blenda;                                         \
   Bit32s tr, tg, tb, ta;                                                         \
-  Bit32s oow, s, t, lod, ilod;                                                   \
+  Bit32s s, t, lod, ilod;                                                        \
+  double sf, tf, wf;                                                             \
+  double ds1f, dt1f, ds2f, dt2f, rho, rhox, rhoy;                                \
   Bit32s smax, tmax;                                                             \
   Bit32u texbase;                                                                \
   rgb_union c_local;                                                             \
                                                                                  \
   /* determine the S/T/LOD values for this texture */                            \
   if (TEXMODE_ENABLE_PERSPECTIVE(TEXMODE))                                       \
   {                                                                              \
-    oow = fast_reciplog((ITERW), &lod);                                          \
-    s = (Bit32s)((Bit64s)oow * (ITERS) >> 29);                                   \
-    t = (Bit32s)((Bit64s)oow * (ITERT) >> 29);                                   \
-    lod += (LODBASE);                                                            \
+    wf = 1.0 / (ITERW);                                                          \
+    sf = wf * (ITERS);                                                           \
+    tf = wf * (ITERT);                                                           \
+    s = (Bit32s)(Bit64s)(sf * 262144);                                           \
+    t = (Bit32s)(Bit64s)(tf * 262144);                                           \
+    ds1f = wf * ((TT)->dsdx - sf * (TT)->dwdx);                                  \
+    dt1f = wf * ((TT)->dtdx - tf * (TT)->dwdx);                                  \
+    rhox = ds1f * ds1f + dt1f * dt1f;                                            \
+    ds2f = wf * ((TT)->dsdy - sf * (TT)->dwdy);                                  \
+    dt2f = wf * ((TT)->dtdy - tf * (TT)->dwdy);                                  \
+    rhoy = ds2f * ds2f + dt2f * dt2f;                                            \
+    rho = BX_MAX(rhox, rhoy);                                                    \
+    lod = fast_log2(rho) / 2;                                                    \
   }                                                                              \
   else                                                                           \
   {                                                                              \

diff --git a/bochs/iodev/display/voodoo_func.h b/bochs/iodev/display/voodoo_func.h
@@ -76,8 +76,8 @@ static bx_thread_sem_t vertical_sem;
 static Bit8u dither4_lookup[256*16*2];
 static Bit8u dither2_lookup[256*16*2];
 
-/* fast reciprocal+log2 lookup */
-Bit32u voodoo_reciplog[(2 << RECIPLOG_LOOKUP_BITS) + 2];
+/* fast log2 lookup */
+Bit8u voodoo_log[1 << LOG_LOOKUP_BITS];
 
 
 void raster_function(int tmus, void *destbase, Bit32s y, const poly_extent *extent, const void *extradata, int threadid) {
@@ -435,9 +435,6 @@ void recompute_texture_params(tmu_state *t)
 
 BX_CPP_INLINE Bit32s prepare_tmu(tmu_state *t)
 {
-  Bit64s texdx, texdy;
-  Bit32s lodbase;
-
   /* if the texture parameters are dirty, update them */
   if (t->regdirty) {
     recompute_texture_params(t);
@@ -452,22 +449,22 @@ BX_CPP_INLINE Bit32s prepare_tmu(tmu_state *t)
     }
   }
 
-  /* compute (ds^2 + dt^2) in both X and Y as 28.36 numbers */
-  texdx = (Bit64s)(t->dsdx >> 14) * (Bit64s)(t->dsdx >> 14) + (Bit64s)(t->dtdx >> 14) * (Bit64s)(t->dtdx >> 14);
-  texdy = (Bit64s)(t->dsdy >> 14) * (Bit64s)(t->dsdy >> 14) + (Bit64s)(t->dtdy >> 14) * (Bit64s)(t->dtdy >> 14);
-
-  /* pick whichever is larger and shift off some high bits -> 28.20 */
-  if (texdx < texdy)
-    texdx = texdy;
-  texdx >>= 16;
-
-  /* use our fast reciprocal/log on this value; it expects input as a */
-  /* 16.32 number, and returns the log of the reciprocal, so we have to */
-  /* adjust the result: negative to get the log of the original value */
-  /* plus 12 to account for the extra exponent, and divided by 2 to */
-  /* get the log of the square root of texdx */
-  (void)fast_reciplog(texdx, &lodbase);
-  return (-lodbase + (12 << 8)) / 2;
+  // compute (ds^2 + dt^2) in both X and Y; note that these values are
+  // each .32, so the square is a .64 fixed point value
+  double fdsdx = (double)t->dsdx;
+  double fdsdy = (double)t->dsdy;
+  double fdtdx = (double)t->dtdx;
+  double fdtdy = (double)t->dtdy;
+  double texdx = fdsdx * fdsdx + fdtdx * fdtdx;
+  double texdy = fdsdy * fdsdy + fdtdy * fdtdy;
+
+  // pick whichever is larger
+  double maxval = BX_MAX(texdx, texdy);
+
+  // use our fast log2 on this value; 64 to indicate how many bits of
+  // fractional resolution are in the source, and divide by 2 because
+  // we really want the log of the square root
+  return fast_log2(maxval, 64) / 2;
 }
 
 
@@ -4017,12 +4014,11 @@ void voodoo_init(Bit8u _type)
   v->pci.fifo.size = 64*2;
   v->pci.fifo.in = v->pci.fifo.out = 0;
 
-  /* create a table of precomputed 1/n and log2(n) values */
+  /* create a table of precomputed log2(n) values */
   /* n ranges from 1.0000 to 2.0000 */
-  for (val = 0; val <= (1 << RECIPLOG_LOOKUP_BITS); val++) {
-    Bit32u value = (1 << RECIPLOG_LOOKUP_BITS) + val;
-    voodoo_reciplog[val*2 + 0] = (1 << (RECIPLOG_LOOKUP_PREC + RECIPLOG_LOOKUP_BITS)) / value;
-    voodoo_reciplog[val*2 + 1] = (Bit32u)(LOGB2((double)value / (double)(1 << RECIPLOG_LOOKUP_BITS)) * (double)(1 << RECIPLOG_LOOKUP_PREC));
+  for (val = 0; val < (1 << LOG_LOOKUP_BITS); val++) {
+    Bit32u value = (1 << LOG_LOOKUP_BITS) + val;
+    voodoo_log[val] = (Bit32u)(LOGB2((double)value / (double)(1 << LOG_LOOKUP_BITS)) * (double)(1 << LOG_OUTPUT_PREC));
   }
 
   /* create dithering tables */