From ab157d907f8278b862ac1f7cf047acb877f63a76 Mon Sep 17 00:00:00 2001
From: Patrick Damme <patrick.damme@tu-berlin.de>
Date: Fri, 13 Dec 2024 00:42:52 +0100
Subject: [PATCH] Performance improvement of the FilterCol-kernel on
 DenseMatrix.

- This commit changes the algorithm used to do the main work of the kernel.
- The previous algorithm evaluated all entries of the selection bit vector for every row of the input matrix.
- The new algorithm creates a vector of positions from the input bit vector first and evaluates only those positions for every row of the input matrix.
- With the value types we currently use, the new approach should always be faster than the old one; but the trade-offs may change with other value types for the bit vector and positions (depending on the number of 1 bits).
- With the old algorithm, FilterCol was the most expensive operation in decision trees/random forests for a particular use-case script; the new algorithm speeds up that script by 4x on my machine.
---
 src/runtime/local/kernels/FilterCol.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
diff --git a/src/runtime/local/kernels/FilterCol.h b/src/runtime/local/kernels/FilterCol.h
index 3e9671456..1d6b68227 100644
--- a/src/runtime/local/kernels/FilterCol.h
+++ b/src/runtime/local/kernels/FilterCol.h
@@ -73,6 +73,16 @@ template <typename VT, typename VTSel> struct FilterCol<DenseMatrix<VT>, DenseMa
         VT *valuesRes = res->getValues();
         const size_t rowSkipArg = arg->getRowSkip();
         const size_t rowSkipRes = res->getRowSkip();
+
+        // Two alternative approaches for doing the main work.
+        // Note that even though sel is essentially a bit vector, we represent it as a 64-bit integer at the moment, for
+        // simplicity. As both the elements in sel used in approach 1 and the positions in approach 2 are currently
+        // represented as 64-bit integers, approach 2 should always be faster than approach 1, because in approach 2 we
+        // iterate over at most as many 64-bit values as in approach 1 while we can omit the check. Once we change to a
+        // 1-bit representation for the values in sel, we should rethink the trade-off between the two approaches.
+#if 0 // approach 1
+      // For every row in arg, iterate over all elements in sel (one per column in arg), check if the respective
+      // column should be part of the output and if so, copy the value to the output.
         for (size_t r = 0; r < numRows; r++) {
             for (size_t ca = 0, cr = 0; ca < numColsArg; ca++)
                 if (sel->get(ca, 0))
@@ -80,6 +90,22 @@ template <typename VT, typename VTSel> struct FilterCol<DenseMatrix<VT>, DenseMa
             valuesArg += rowSkipArg;
             valuesRes += rowSkipRes;
         }
+#else // approach 2
+      // Once in the beginning, create a vector of the positions of the columns we want to copy to the output.
+      // Negligible effort, unless the number of rows in arg is very small.
+        std::vector<size_t> positions;
+        for (size_t c = 0; c < numColsArg; c++)
+            if (sel->get(c, 0))
+                positions.push_back(c);
+        // For every row in arg, iterate over the array of those positions and copy the value in the respective column
+        // to the output.
+        for (size_t r = 0; r < numRows; r++) {
+            for (size_t i = 0; i < positions.size(); i++)
+                valuesRes[i] = valuesArg[positions[i]];
+            valuesArg += rowSkipArg;
+            valuesRes += rowSkipRes;
+        }
+#endif
     }
 };