From 822dfbd012299511a346e96e267c1b6365f48516 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 29 Oct 2020 22:49:39 -0400 Subject: [PATCH] lib/os/prf.c: alternate implementation for _ldiv5() The _ldiv5() is an optimized divide-by-5 function that is smaller and faster than the generic libgcc implementation. Yet it can be made even smaller and faster with this replacement implementation based on a reciprocal multiplication plus some tricks. For example, here's the assembly from the original code on ARM: _ldiv5: ldr r3, [r0] movw ip, #52429 ldr r1, [r0, #4] movt ip, 52428 adds r3, r3, #2 push {r4, r5, r6, r7, lr} mov lr, #0 adc r1, r1, lr adds r2, lr, lr umull r7, r6, ip, r1 lsr r6, r6, #2 adc r7, r6, r6 adds r2, r2, r2 adc r7, r7, r7 adds r2, r2, lr adc r7, r7, r6 subs r3, r3, r2 sbc r7, r1, r7 lsr r2, r3, #3 orr r2, r2, r7, lsl #29 umull r2, r1, ip, r2 lsr r2, r1, #2 lsr r7, r1, #31 lsl r1, r2, #3 adds r4, lr, r1 adc r5, r6, r7 adds r2, r1, r1 adds r2, r2, r2 adds r2, r2, r1 subs r2, r3, r2 umull r3, r2, ip, r2 lsr r2, r2, #2 adds r4, r4, r2 adc r5, r5, #0 strd r4, [r0] pop {r4, r5, r6, r7, pc} And here's the resulting assembly with this commit applied: _ldiv5: push {r4, r5, r6, r7} movw r4, #13107 ldr r6, [r0] movt r4, 13107 ldr r1, [r0, #4] mov r3, #0 umull r6, r7, r6, r4 add r2, r4, r4, lsl #1 umull r4, r5, r1, r4 adds r1, r6, r2 adc r2, r7, r2 adds ip, r6, r4 adc r1, r7, r5 adds r2, ip, r2 adc r2, r1, r3 adds r2, r4, r2 adc r3, r5, r3 strd r2, [r0] pop {r4, r5, r6, r7} bx lr So we're down to 20 instructions from 36 initially, with only 2 umull instructions instead of 3, and slightly smaller stack footprint. 
Signed-off-by: Nicolas Pitre --- lib/os/prf.c | 63 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/lib/os/prf.c b/lib/os/prf.c index 0e42491d47dcd5..5d48d067554081 100644 --- a/lib/os/prf.c +++ b/lib/os/prf.c @@ -130,37 +130,56 @@ static void _rlrshift(uint64_t *v) * sense to define this much smaller special case here to avoid * including it just for printf. * - * It works by iteratively dividing the most significant 32 bits of - * the 64 bit value by 5. This will leave a remainder of 0-4 - * (i.e. three significant bits), ensuring that the top 29 bits of the - * remainder are zero for the next iteration. Thus in the second - * iteration only 35 significant bits remain, and in the third only - * six. This was tested exhaustively through the first ~10B values in - * the input space, and for ~2e12 (4 hours runtime) random inputs - * taken from the full 64 bit space. + * It works by multiplying v by the reciprocal of 5 i.e.: + * + * result = v * ((1 << 64) / 5) / (1 << 64) + * + * This produces a 128-bit result, but we drop the bottom 64 bits which + * accounts for the division by (1 << 64). The product is kept to 64 bits + * by summing partial multiplications and shifting right by 32 which on + * most 32-bit architectures means only a register drop. + * + * Here the multiplier is: (1 << 64) / 5 = 0x3333333333333333 + * i.e. a 62 bits value. To compensate for the reduced precision, we + * add an initial bias of 1 to v. Enlarging the multiplier to 64 bits + * would also work but a final right shift would be needed, and carry + * handling on the summing of partial mults would be necessary, requiring + * more instructions. Given that we already want to add bias of 2 for + * the result to be rounded to nearest and not truncated, we might as well + * combine those together into a bias of 3. This also conveniently allows + * for keeping the multiplier in a single 32-bit register given its pattern. 
/*
 * Optimized divide-by-5 for a 64-bit value, smaller and faster than the
 * generic libgcc path, used by the printf machinery in this file.
 *
 * It works by multiplying v by the reciprocal of 5:
 *
 *	result = v * ((1 << 64) / 5) / (1 << 64)
 *
 * The 128-bit product's bottom 64 bits are dropped, accounting for the
 * division by (1 << 64).  The multiplier (1 << 64) / 5 is
 * 0x3333333333333333 — a 62-bit value whose repeating 32-bit pattern
 * lets us keep it in a single 32-bit register.  To compensate for the
 * reduced precision we need a bias of 1 on v; callers also want
 * round-to-nearest rather than truncation, which needs a bias of 2.
 * Both are combined into a single bias of 3, folded into the product
 * up front so v itself never overflows at max range.
 */
static void _ldiv5(uint64_t *v)
{
	uint32_t v_lo = *v;
	uint32_t v_hi = *v >> 32;
	uint32_t m = 0x33333333;
	uint64_t result;

	/*
	 * Force the multiplier constant into a register and make it
	 * opaque to the compiler, otherwise gcc tries to be too smart
	 * for its own good with a large expansion of adds and shifts.
	 */
	__asm__ ("" : "+r" (m));

	/*
	 * Apply the bias of 3.  We can't add it to v as this would
	 * overflow it when at max range.  Factor it out with the
	 * multiplier upfront instead.  The low and high halves are
	 * multiplied separately to avoid a 64-bit add-with-carry.
	 */
	result = ((uint64_t)(m * 3U) << 32) | (m * 3U);

	/*
	 * The actual multiplication: sum the partial products, shifting
	 * right by 32 between stages so the running sum stays within
	 * 64 bits (on most 32-bit targets that shift is just a
	 * register drop).
	 */
	result += (uint64_t)v_lo * m;
	result >>= 32;
	result += (uint64_t)v_lo * m;
	result += (uint64_t)v_hi * m;
	result >>= 32;
	result += (uint64_t)v_hi * m;

	*v = result;
}