Cheaper compute_lowest_and_highest_weight (#516)

The actual weights array is constant, so we can compute the min/max weights in a single pass ahead of time. The remapping we apply to the weights depending on rcp_stepsize and offset is monotonic; therefore we can figure out minidx and maxidx in advance from the min/max weights we determined previously.

This in turn means that we don't need to do the "reset" handling in the loop where we zero the accumulator whenever the current min/max changes. The amount of code in the inner loop nest is considerably reduced as a result, and the extra preprocessing outside that loop is generally less than the work it replaces.

The exact impact depends on the block size and max_angular_steps. Testing with RGB textures and -thorough quality, I observe approximately 0.7% coding time reduction with 4x4 blocks; 6x6 and larger see a 1.2% reduction. At -fast quality, there is not much gain, but neither is it any slower.

This change does not alter encoding results.

Signed-off-by: Fabian Giesen <[email protected]>
ARM-software · Nov 9, 2024 · 33082c2
1 parent 409a9da
commit 33082c2
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 15 deletions.
diff --git a/Docs/ChangeLog-5x.md b/Docs/ChangeLog-5x.md
@@ -22,6 +22,8 @@ The 5.1.0 release is a maintenance release.
   * **Optimization:** Added new `gather()` abstraction for gathers using byte
     indices, allowing implementations without gather hardware to skip the
     byte-to-int index conversion.
+  * **Optimization:** Optimized `compute_lowest_and_highest_weight()` to
+    pre-compute min/max outside of the main loop.
   * **Optimization:** Added improved intrinsics sequence for SSE and AVX2
     `hmin()` and `hmax()`.
   * **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`

diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp
@@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>
 
 static constexpr unsigned int ANGULAR_STEPS { 32 };
 
@@ -169,39 +170,60 @@ static void compute_lowest_and_highest_weight(
 
 	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
 
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+	unsigned int partial_weight_start = round_down_to_simd_multiple_vla(weight_count);
+	for (unsigned int i = 0; i < partial_weight_start; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, weights);
+		max_weight = max(max_weight, weights);
+	}
+
+	if (partial_weight_start != weight_count)
+	{
+		vfloat partial_weights = loada(dec_weight_ideal_value + partial_weight_start);
+		vmask active = vint::lane_id() < vint(weight_count - partial_weight_start);
+
+		vmask smaller = active & (partial_weights < min_weight);
+		min_weight = select(min_weight, partial_weights, smaller);
+
+		vmask larger = active & (partial_weights > max_weight);
+		max_weight = select(max_weight, partial_weights, larger);
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
+
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);
 
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
 			vfloat svalrte = round(sval);
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;
 
-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
 
-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);