From ab157d907f8278b862ac1f7cf047acb877f63a76 Mon Sep 17 00:00:00 2001 From: Patrick Damme Date: Fri, 13 Dec 2024 00:42:52 +0100 Subject: [PATCH] Performance improvement of the FilterCol-kernel on DenseMatrix. - This commit changes the algorithm used to do the main work of the kernel. - The previous algorithm evaluated all entries of the selection bit vector for every row of the input matrix. - The new algorithm creates a vector of positions from the input bit vector first and evaluates only those positions for every row of the input matrix. - With the value types we currently use, the new approach should always be faster than the old one; but the trade-offs may change with other value types for the bit vector and positions (depending on the number of 1 bits). - With the old algorithm, FilterCol was the most expensive operation in decision trees/random forests for a particular use-case script; the new algorithm speeds up that script by 4x on my machine. --- src/runtime/local/kernels/FilterCol.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/runtime/local/kernels/FilterCol.h b/src/runtime/local/kernels/FilterCol.h index 3e9671456..1d6b68227 100644 --- a/src/runtime/local/kernels/FilterCol.h +++ b/src/runtime/local/kernels/FilterCol.h @@ -73,6 +73,16 @@ template struct FilterCol, DenseMa VT *valuesRes = res->getValues(); const size_t rowSkipArg = arg->getRowSkip(); const size_t rowSkipRes = res->getRowSkip(); + + // Two alternative approaches for doing the main work. + // Note that even though sel is essentially a bit vector, we represent it as a 64-bit integer at the moment, for + // simplicity. As both the elements in sel used in approach 1 and the positions in approach 2 are currently + // represented as 64-bit integers, approach 2 should always be faster than approach 1, because in approach 2 we + // iterate over at most as many 64-bit values as in approach 1 while we can omit the check. Once we change to a + // 1-bit representation for the values in sel, we should rethink the trade-off between the two approaches. +#if 0 // approach 1 + // For every row in arg, iterate over all elements in sel (one per column in arg), check if the respective + // column should be part of the output and if so, copy the value to the output. for (size_t r = 0; r < numRows; r++) { for (size_t ca = 0, cr = 0; ca < numColsArg; ca++) if (sel->get(ca, 0)) @@ -80,6 +90,22 @@ template struct FilterCol, DenseMa valuesArg += rowSkipArg; valuesRes += rowSkipRes; } +#else // approach 2 + // Once in the beginning, create a vector of the positions of the columns we want to copy to the output. + // Negligible effort, unless the number of rows in arg is very small. + std::vector positions; + for (size_t c = 0; c < numColsArg; c++) + if (sel->get(c, 0)) + positions.push_back(c); + // For every row in arg, iterate over the array of those positions and copy the value in the respective column + // to the output. + for (size_t r = 0; r < numRows; r++) { + for (size_t i = 0; i < positions.size(); i++) + valuesRes[i] = valuesArg[positions[i]]; + valuesArg += rowSkipArg; + valuesRes += rowSkipRes; + } +#endif } };