diff --git a/crates/polars-core/src/chunked_array/from_iterator_par.rs b/crates/polars-core/src/chunked_array/from_iterator_par.rs
index 12263053e368..eaf45d1d651f 100644
--- a/crates/polars-core/src/chunked_array/from_iterator_par.rs
+++ b/crates/polars-core/src/chunked_array/from_iterator_par.rs
@@ -82,21 +82,21 @@ where
 {
     fn from_par_iter<I: IntoParallelIterator<Item = Option<T::Native>>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutablePrimitiveArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
     }
 }
 
 impl FromParallelIterator<bool> for BooleanChunked {
     fn from_par_iter<I: IntoParallelIterator<Item = bool>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutableBooleanArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
     }
 }
 
 impl FromParallelIterator<Option<bool>> for BooleanChunked {
     fn from_par_iter<I: IntoParallelIterator<Item = Option<bool>>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutableBooleanArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
     }
 }
 
@@ -106,7 +106,7 @@ where
 {
     fn from_par_iter<I: IntoParallelIterator<Item = Ptr>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
     }
 }
 
@@ -116,7 +116,7 @@ where
 {
     fn from_par_iter<I: IntoParallelIterator<Item = Ptr>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
     }
 }
 
@@ -126,7 +126,7 @@ where
 {
     fn from_par_iter<I: IntoParallelIterator<Item = Option<Ptr>>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
    }
 }
 
@@ -136,7 +136,7 @@ where
 {
     fn from_par_iter<I: IntoParallelIterator<Item = Option<Ptr>>>(iter: I) -> Self {
         let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new);
-        Self::from_chunk_iter("", chunks)
+        Self::from_chunk_iter("", chunks).optional_rechunk()
     }
 }
 
diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs
index 05d682fc6ed9..72d1f1e53813 100644
--- a/crates/polars-core/src/chunked_array/mod.rs
+++ b/crates/polars-core/src/chunked_array/mod.rs
@@ -144,6 +144,19 @@ pub struct ChunkedArray<T: PolarsDataType> {
 }
 
 impl<T: PolarsDataType> ChunkedArray<T> {
+    fn should_rechunk(&self) -> bool {
+        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
+    }
+
+    fn optional_rechunk(self) -> Self {
+        // Rechunk if we have many small chunks.
+        if self.should_rechunk() {
+            self.rechunk()
+        } else {
+            self
+        }
+    }
+
     /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
     ///
     /// If you want to explicitly the `length` and `null_count`, look at
@@ -181,7 +194,7 @@ impl<T: PolarsDataType> ChunkedArray<T> {
 
     /// Get a reference to the used [`Metadata`]
     ///
-    /// This results a reference to an empty [`Metadata`] if its unset for this [`ChunkedArray`].
+    /// This results a reference to an empty [`Metadata`] if it's unset for this [`ChunkedArray`].
     #[inline(always)]
     pub fn effective_metadata(&self) -> &Metadata<T> {
         self.md.as_ref().map_or(&Metadata::DEFAULT, AsRef::as_ref)
diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs
index 5a0169752c79..97f25abc4196 100644
--- a/crates/polars-expr/src/expressions/window.rs
+++ b/crates/polars-expr/src/expressions/window.rs
@@ -688,7 +688,7 @@ fn set_by_groups(
 
         macro_rules! dispatch {
             ($ca:expr) => {{
-                set_numeric($ca, groups, len)
+                Some(set_numeric($ca, groups, len))
             }};
         }
         downcast_as_macro_arg_physical!(&s, dispatch).map(|s| s.cast(dtype).unwrap())
@@ -697,7 +697,7 @@ fn set_by_groups(
     }
 }
 
-fn set_numeric<T>(ca: &ChunkedArray<T>, groups: &GroupsProxy, len: usize) -> Option<Series>
+fn set_numeric<T>(ca: &ChunkedArray<T>, groups: &GroupsProxy, len: usize) -> Series
 where
     T: PolarsNumericType,
     ChunkedArray<T>: IntoSeries,
@@ -709,10 +709,10 @@ where
     let sync_ptr_values = unsafe { SyncPtr::new(ptr) };
 
     if ca.null_count() == 0 {
+        let ca = ca.rechunk();
         match groups {
             GroupsProxy::Idx(groups) => {
-                // this should always succeed as we don't expect any chunks after an aggregation
-                let agg_vals = ca.cont_slice().ok()?;
+                let agg_vals = ca.cont_slice().expect("rechunked");
                 POOL.install(|| {
                     agg_vals
                         .par_iter()
@@ -727,8 +727,7 @@ where
                 })
             },
             GroupsProxy::Slice { groups, .. } => {
-                // this should always succeed as we don't expect any chunks after an aggregation
-                let agg_vals = ca.cont_slice().ok()?;
+                let agg_vals = ca.cont_slice().expect("rechunked");
                 POOL.install(|| {
                     agg_vals
                         .par_iter()
@@ -748,7 +747,7 @@ where
 
         // SAFETY: we have written all slots
         unsafe { values.set_len(len) }
-        Some(ChunkedArray::new_vec(ca.name(), values).into_series())
+        ChunkedArray::new_vec(ca.name(), values).into_series()
     } else {
         // We don't use a mutable bitmap as bits will have have race conditions!
         // A single byte might alias if we write from single threads.
@@ -826,6 +825,6 @@ where
             values.into(),
             Some(validity),
         );
-        Some(Series::try_from((ca.name(), arr.boxed())).unwrap())
+        Series::try_from((ca.name(), arr.boxed())).unwrap()
     }
 }
diff --git a/crates/polars/tests/it/lazy/functions.rs b/crates/polars/tests/it/lazy/functions.rs
index 8b4cc810d2ae..ff6dfec5a0b1 100644
--- a/crates/polars/tests/it/lazy/functions.rs
+++ b/crates/polars/tests/it/lazy/functions.rs
@@ -1,6 +1,7 @@
 use super::*;
 
 #[test]
+#[cfg(all(feature = "concat_str", feature = "strings"))]
 fn test_format_str() {
     let a = df![
         "a" => [1, 2],
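
For reference, a minimal standalone sketch of the rechunk heuristic this patch adds to ChunkedArray (`should_rechunk` / `optional_rechunk`). The free function `should_rechunk` and the sample numbers are hypothetical stand-ins for `self.chunks.len()` and `self.len()`; only the condition itself comes from the patch above, and this is not polars code.

// Standalone illustration of the condition added in
// crates/polars-core/src/chunked_array/mod.rs (hypothetical helper, not polars API).
fn should_rechunk(n_chunks: usize, total_len: usize) -> bool {
    // Rechunk only when the data is split at all and the chunks are tiny on
    // average (fewer than ~3 elements per chunk).
    n_chunks > 1 && n_chunks > total_len / 3
}

fn main() {
    assert!(!should_rechunk(1, 10));     // single chunk: nothing to do
    assert!(!should_rechunk(8, 1_000));  // a few large chunks: keep them as-is
    assert!(should_rechunk(600, 1_000)); // many tiny chunks: collapse into one
    println!("heuristic matches the patch");
}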