Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(rust): properly set boolean distinct count #16782

Merged
merged 2 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions crates/polars-arrow/src/bitmap/bitmap_ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,16 @@ fn eq(lhs: &Bitmap, rhs: &Bitmap) -> bool {
lhs_remainder.zip(rhs_remainder).all(|(x, y)| x == y)
}

/// Counts the set bits that `lhs` and `rhs` have in common.
///
/// Each pair of chunks handed over by `binary_fold` is AND-ed together and its
/// set bits are counted; the per-chunk counts are then summed, starting from 0.
pub fn num_intersections_with(lhs: &Bitmap, rhs: &Bitmap) -> usize {
    binary_fold(
        lhs,
        rhs,
        |left_chunk, right_chunk| (left_chunk & right_chunk).count_ones() as usize,
        0,
        |running_total, chunk_count| running_total + chunk_count,
    )
}

pub fn intersects_with(lhs: &Bitmap, rhs: &Bitmap) -> bool {
binary_fold(
lhs,
Expand Down
7 changes: 6 additions & 1 deletion crates/polars-arrow/src/bitmap/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use either::Either;
use polars_error::{polars_bail, PolarsResult};

use super::utils::{count_zeros, fmt, get_bit, get_bit_unchecked, BitChunk, BitChunks, BitmapIter};
use super::{chunk_iter_to_vec, intersects_with, IntoIter, MutableBitmap};
use super::{chunk_iter_to_vec, intersects_with, num_intersections_with, IntoIter, MutableBitmap};
use crate::array::Splitable;
use crate::bitmap::aligned::AlignedBitmapSlice;
use crate::bitmap::iterator::{
Expand Down Expand Up @@ -480,6 +480,11 @@ impl Bitmap {
pub fn intersects_with(&self, other: &Self) -> bool {
// Delegates to the free `intersects_with` function imported from the parent module.
intersects_with(self, other)
}

/// Calculates the number of shared set bits between two [`Bitmap`]s.
///
/// A bit counts as shared when it is set in both `self` and `other`.
pub fn num_intersections_with(&self, other: &Self) -> usize {
// Delegates to the free `num_intersections_with` function imported from the parent module.
num_intersections_with(self, other)
}
}

impl<P: AsRef<[bool]>> From<P> for Bitmap {
Expand Down
64 changes: 54 additions & 10 deletions crates/polars-compute/src/distinct_count.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,68 @@
use arrow::array::{Array, BooleanArray};

/// Kernel to calculate the number of unique elements.
///
/// A null is also considered a unique value.
pub trait DistinctCountKernel {
/// Calculate the number of unique elements in [`Self`].
///
/// A null is also considered a unique value.
fn distinct_count(&self) -> usize;
}

impl DistinctCountKernel for BooleanArray {
fn distinct_count(&self) -> usize {
if self.len() - self.null_count() == 0 {
if self.len() == 0 {
return 0;
}

if self.null_count() == 0 {
let unset_bits = self.values().unset_bits();
2 - usize::from(unset_bits == 0 || unset_bits == self.values().len())
} else {
let values = self.values() & self.validity().unwrap();
let unset_bits = self.values().unset_bits();
3 - usize::from(unset_bits == 0 || unset_bits == values.len())
let null_count = self.null_count();

if self.len() == null_count {
return 1;
}

let values = self.values();

if null_count == 0 {
let unset_bits = values.unset_bits();
let is_uniform = unset_bits == 0 || unset_bits == values.len();
return 2 - usize::from(is_uniform);
}

let validity = self.validity().unwrap();
let set_bits = values.num_intersections_with(validity);
let is_uniform = set_bits == 0 || set_bits == validity.set_bits();
2 + usize::from(!is_uniform)
}
}

#[test]
fn test_boolean_distinct_count() {
    use arrow::bitmap::Bitmap;
    use arrow::datatypes::ArrowDataType;

    // Builds a `BooleanArray` from plain `bool` vectors and asserts its distinct count.
    fn check(values: Vec<bool>, validity: Option<Vec<bool>>, expected: usize) {
        let validity: Option<Bitmap> = validity.map(Bitmap::from_iter);
        let values = Bitmap::from_iter(values);
        let arr = BooleanArray::new(ArrowDataType::Boolean, values, validity);
        assert_eq!(arr.distinct_count(), expected);
    }

    // Empty arrays, with and without a validity bitmap.
    check(vec![], None, 0);
    check(vec![], Some(vec![]), 0);

    // Single element: valid or null, there is exactly one distinct value.
    check(vec![true], None, 1);
    check(vec![true], Some(vec![true]), 1);
    check(vec![true], Some(vec![false]), 1);

    // Both boolean values present, with and without an additional null.
    check(vec![true, false], None, 2);
    check(vec![true, false, false], None, 2);
    check(vec![true, false, false], Some(vec![true, true, false]), 3);

    // Copied from https://github.com/pola-rs/polars/pull/16765#discussion_r1629426159
    check(
        vec![true, true, true, true, true],
        Some(vec![true, false, true, false, false]),
        2,
    );
    check(
        vec![false, true, false, true, true],
        Some(vec![true, false, true, false, false]),
        2,
    );
    check(
        vec![true, false, true, false, true, true],
        Some(vec![true, true, false, true, false, false]),
        3,
    );
}