From df94bde3f8faa13fbd24f8b6a9866788e022d05b Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 6 Jun 2024 16:06:54 +0200 Subject: [PATCH 1/2] fix(rust): distinct count on BooleanArray --- crates/polars-arrow/src/bitmap/bitmap_ops.rs | 10 ++++ crates/polars-arrow/src/bitmap/immutable.rs | 7 ++- crates/polars-compute/src/distinct_count.rs | 56 +++++++++++++++++--- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/crates/polars-arrow/src/bitmap/bitmap_ops.rs b/crates/polars-arrow/src/bitmap/bitmap_ops.rs index dbb4e1fc7bee..4392bdf25ee6 100644 --- a/crates/polars-arrow/src/bitmap/bitmap_ops.rs +++ b/crates/polars-arrow/src/bitmap/bitmap_ops.rs @@ -270,6 +270,16 @@ fn eq(lhs: &Bitmap, rhs: &Bitmap) -> bool { lhs_remainder.zip(rhs_remainder).all(|(x, y)| x == y) } +pub fn num_intersections_with(lhs: &Bitmap, rhs: &Bitmap) -> usize { + binary_fold( + lhs, + rhs, + |lhs, rhs| (lhs & rhs).count_ones() as usize, + 0, + |lhs, rhs| lhs + rhs, + ) +} + pub fn intersects_with(lhs: &Bitmap, rhs: &Bitmap) -> bool { binary_fold( lhs, diff --git a/crates/polars-arrow/src/bitmap/immutable.rs b/crates/polars-arrow/src/bitmap/immutable.rs index f75b2ff49dd8..0b85af625c14 100644 --- a/crates/polars-arrow/src/bitmap/immutable.rs +++ b/crates/polars-arrow/src/bitmap/immutable.rs @@ -6,7 +6,7 @@ use either::Either; use polars_error::{polars_bail, PolarsResult}; use super::utils::{count_zeros, fmt, get_bit, get_bit_unchecked, BitChunk, BitChunks, BitmapIter}; -use super::{chunk_iter_to_vec, intersects_with, IntoIter, MutableBitmap}; +use super::{chunk_iter_to_vec, intersects_with, num_intersections_with, IntoIter, MutableBitmap}; use crate::array::Splitable; use crate::bitmap::aligned::AlignedBitmapSlice; use crate::bitmap::iterator::{ @@ -480,6 +480,11 @@ impl Bitmap { pub fn intersects_with(&self, other: &Self) -> bool { intersects_with(self, other) } + + /// Calculates the number of shared set bits between two [`Bitmap`]s. + pub fn num_intersections_with(&self, other: &Self) -> usize { + num_intersections_with(self, other) + } } impl> From

for Bitmap { diff --git a/crates/polars-compute/src/distinct_count.rs b/crates/polars-compute/src/distinct_count.rs index 9d621eaa3ea5..f4bd8713ac78 100644 --- a/crates/polars-compute/src/distinct_count.rs +++ b/crates/polars-compute/src/distinct_count.rs @@ -8,17 +8,57 @@ pub trait DistinctCountKernel { impl DistinctCountKernel for BooleanArray { fn distinct_count(&self) -> usize { - if self.len() - self.null_count() == 0 { + if self.len() == 0 { return 0; } - if self.null_count() == 0 { - let unset_bits = self.values().unset_bits(); - 2 - usize::from(unset_bits == 0 || unset_bits == self.values().len()) - } else { - let values = self.values() & self.validity().unwrap(); - let unset_bits = self.values().unset_bits(); - 3 - usize::from(unset_bits == 0 || unset_bits == values.len()) + let null_count = self.null_count(); + + if self.len() == null_count { + return 1; + } + + let values = self.values(); + + if null_count == 0 { + let unset_bits = values.unset_bits(); + let is_uniform = unset_bits == 0 || unset_bits == values.len(); + return 2 - usize::from(is_uniform); } + + let validity = self.validity().unwrap(); + let set_bits = values.num_intersections_with(validity); + let is_uniform = set_bits == 0 || set_bits == validity.set_bits(); + 2 + usize::from(!is_uniform) } } + +#[test] +fn test_boolean_distinct_count() { + use arrow::bitmap::Bitmap; + use arrow::datatypes::ArrowDataType; + + macro_rules! assert_bool_dc { + ($values:expr, $validity:expr => $dc:expr) => { + let validity: Option = + >>::map($validity, |v| Bitmap::from_iter(v)); + let arr = + BooleanArray::new(ArrowDataType::Boolean, Bitmap::from_iter($values), validity); + assert_eq!(arr.distinct_count(), $dc); + }; + } + + assert_bool_dc!(vec![], None => 0); + assert_bool_dc!(vec![], Some(vec![]) => 0); + assert_bool_dc!(vec![true], None => 1); + assert_bool_dc!(vec![true], Some(vec![true]) => 1); + assert_bool_dc!(vec![true], Some(vec![false]) => 1); + assert_bool_dc!(vec![true, false], None => 2); + assert_bool_dc!(vec![true, false, false], None => 2); + assert_bool_dc!(vec![true, false, false], Some(vec![true, true, false]) => 3); + + // Copied from https://github.com/pola-rs/polars/pull/16765#discussion_r1629426159 + assert_bool_dc!(vec![true, true, true, true, true], Some(vec![true, false, true, false, false]) => 2); + assert_bool_dc!(vec![false, true, false, true, true], Some(vec![true, false, true, false, false]) => 2); + assert_bool_dc!(vec![true, false, true, false, true, true], Some(vec![true, true, false, true, false, false]) => 3); +} From 25a2265b4977d698e9fb281095e815531400cfcd Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 6 Jun 2024 16:08:20 +0200 Subject: [PATCH 2/2] fix(rust): properly calculate boolarray num uniq --- crates/polars-compute/src/distinct_count.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/polars-compute/src/distinct_count.rs b/crates/polars-compute/src/distinct_count.rs index f4bd8713ac78..a94643bb1382 100644 --- a/crates/polars-compute/src/distinct_count.rs +++ b/crates/polars-compute/src/distinct_count.rs @@ -1,8 +1,12 @@ use arrow::array::{Array, BooleanArray}; -/// Kernel to calculate the number of unique non-null elements +/// Kernel to calculate the number of unique elements +/// +/// A null is also considered a unique value pub trait DistinctCountKernel { - /// Calculate the number of unique non-null elements in [`Self`] + /// Calculate the number of unique elements in [`Self`] + /// + /// A null is also considered a unique value fn distinct_count(&self) -> usize; }