From 95906a09dc5ac777889059fac3d3cab396d4e0f5 Mon Sep 17 00:00:00 2001 From: Rouven Spreckels Date: Fri, 21 Apr 2023 22:17:36 +0200 Subject: [PATCH] Make `FreedmanDiaconis` strategy more robust. Use improper IQR and Scott's rule as asymptotic resort. Introduce maximum number of bins with `u16::MAX` as default. Compute `n_bins` arithmetically instead of iteratively. --- Cargo.toml | 2 +- RELEASES.md | 6 ++ src/histogram/strategies.rs | 195 ++++++++++++++++++++++++++---------- 3 files changed, 149 insertions(+), 54 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6a6e345..fe648f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ndarray-histogram" -version = "0.1.0" +version = "0.2.0" rust-version = "1.60" edition = "2021" authors = ["Rouven Spreckels "] diff --git a/RELEASES.md b/RELEASES.md index 2fdf1bf..3080a30 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -1,3 +1,9 @@ +# Version 0.2.0 (2023-04-21) + + * Make `FreedmanDiaconis` strategy more robust. Use improper IQR and Scott's + rule as asymptotic resort. Introduce maximum number of bins with `u16::MAX` + as default. Compute `n_bins` arithmetically instead of iteratively. + # Version 0.1.0 (2023-04-13) * Fix binning strategies by using [`ndarray-slice`], which as well comes with diff --git a/src/histogram/strategies.rs b/src/histogram/strategies.rs index 109a9fd..d0566cc 100644 --- a/src/histogram/strategies.rs +++ b/src/histogram/strategies.rs @@ -27,8 +27,8 @@ //! //! # Notes //! -//! In general, successful infererence on optimal bin width and number of bins relies on -//! **variability** of data. In other word, the provided ovservations should not be empty or +//! In general, successful inference on optimal bin width and number of bins relies on +//! **variability** of data. In other word, the provided observations should not be empty or //! constant. //! //! In addition, [`Auto`] and [`FreedmanDiaconis`] requires the [`interquartile range (IQR)`][iqr], @@ -52,7 +52,7 @@ use crate::{ }; use ndarray::{prelude::*, Data}; use noisy_float::types::n64; -use num_traits::{FromPrimitive, NumOps, Zero}; +use num_traits::{FromPrimitive, NumOps, ToPrimitive, Zero}; /// A trait implemented by all strategies to build [`Bins`] with parameters inferred from /// observations. @@ -66,16 +66,40 @@ use num_traits::{FromPrimitive, NumOps, Zero}; pub trait BinsBuildingStrategy { #[allow(missing_docs)] type Elem: Ord + Send; - /// Returns a strategy that has learnt the required parameter fo building [`Bins`] for given + /// Returns a strategy that has learnt the required parameter for building [`Bins`] for given /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter /// with the given data and specified strategy. /// + /// Calls [`Self::from_array_with_max`] with `max_n_bins` of [`u16::MAX`]. + /// /// # Errors /// - /// See each of the struct-level documentation for details on errors an implementor may return. + /// See each of the `struct`-level documentation for details on errors an implementation may + /// return. /// /// [`Bins`]: ../struct.Bins.html fn from_array(array: &ArrayBase) -> Result + where + S: Data, + Self: std::marker::Sized, + { + Self::from_array_with_max(array, u16::MAX.into()) + } + + /// Returns a strategy that has learnt the required parameter for building [`Bins`] for given + /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter + /// with the given data and specified strategy. + /// + /// # Errors + /// + /// See each of the `struct`-level documentation for details on errors an implementation may + /// return. Fails if the strategy requires more bins than `max_n_bins`. + /// + /// [`Bins`]: ../struct.Bins.html + fn from_array_with_max( + array: &ArrayBase, + max_n_bins: usize, + ) -> Result where S: Data, Self: std::marker::Sized; @@ -157,13 +181,19 @@ pub struct Sturges { /// Robust (resilient to outliers) strategy that takes into account data variability and data size. /// -/// Let `n` be the number of observations. +/// Let `n` be the number of observations and `at = 1 / 4`. /// -/// `bin_width` = 2 × `IQR` × `n`−1/3 +/// `bin_width` = (1 - 2 × `at`) × `IQR` × `n`−1/3 /// -/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to -/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large -/// datasets. +/// The bin width is proportional to the interquartile range ([`IQR`]) from `at` to `1 - at` and +/// inversely proportional to cube root of `n`. It can be too conservative for small datasets, but +/// it is quite good for large datasets. In case the [`IQR`] is close to zero, `at` is halved and an +/// improper [`IQR`] is computed. This is repeated as long as `at >= 1 / 512`. If no [`IQR`] is +/// found by then, Scott's rule is used as asymptotic resort which is based on the standard +/// deviation (SD). If the SD is close to zero as well, this strategy fails with +/// [`BinsBuildError::Strategy`]. As there is no one-fit-all epsilon, whether the IQR or standard +/// deviation is close to zero is indirectly tested by requiring the computed number of bins to not +/// exceed `max_n_bins` with a default of [`u16::MAX`]. /// /// The [`IQR`] is very robust to outliers. /// @@ -213,7 +243,7 @@ pub struct Auto { impl EquiSpaced where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { /// Returns `Err(BinsBuildError::Strategy)` if `bin_width<=0` or `min` >= `max`. /// Returns `Ok(Self)` otherwise. @@ -240,13 +270,10 @@ where } fn n_bins(&self) -> usize { - let mut max_edge = self.min.clone(); - let mut n_bins = 0; - while max_edge <= self.max { - max_edge = max_edge + self.bin_width.clone(); - n_bins += 1; - } - n_bins + let min = self.min.to_f64().unwrap(); + let max = self.max.to_f64().unwrap(); + let bin_width = self.bin_width.to_f64().unwrap(); + usize::from_f64(((max - min) / bin_width + 0.5).ceil()).unwrap_or(usize::MAX) } fn bin_width(&self) -> T { @@ -256,14 +283,17 @@ where impl BinsBuildingStrategy for Sqrt where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { type Elem = T; /// Returns `Err(BinsBuildError::Strategy)` if the array is constant. /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`. /// Returns `Ok(Self)` otherwise. - fn from_array(a: &ArrayBase) -> Result + fn from_array_with_max( + a: &ArrayBase, + max_n_bins: usize, + ) -> Result where S: Data, { @@ -278,7 +308,11 @@ where let max = a.max()?; let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins); let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?; - Ok(Self { builder }) + if builder.n_bins() > max_n_bins { + Err(BinsBuildError::Strategy) + } else { + Ok(Self { builder }) + } } fn build(&self) -> Bins { @@ -292,7 +326,7 @@ where impl Sqrt where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { /// The bin width (or bin length) according to the fitted strategy. pub fn bin_width(&self) -> T { @@ -302,14 +336,17 @@ where impl BinsBuildingStrategy for Rice where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { type Elem = T; /// Returns `Err(BinsBuildError::Strategy)` if the array is constant. /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`. /// Returns `Ok(Self)` otherwise. - fn from_array(a: &ArrayBase) -> Result + fn from_array_with_max( + a: &ArrayBase, + max_n_bins: usize, + ) -> Result where S: Data, { @@ -324,7 +361,11 @@ where let max = a.max()?; let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins); let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?; - Ok(Self { builder }) + if builder.n_bins() > max_n_bins { + Err(BinsBuildError::Strategy) + } else { + Ok(Self { builder }) + } } fn build(&self) -> Bins { @@ -338,7 +379,7 @@ where impl Rice where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { /// The bin width (or bin length) according to the fitted strategy. pub fn bin_width(&self) -> T { @@ -348,14 +389,17 @@ where impl BinsBuildingStrategy for Sturges where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { type Elem = T; /// Returns `Err(BinsBuildError::Strategy)` if the array is constant. /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`. /// Returns `Ok(Self)` otherwise. - fn from_array(a: &ArrayBase) -> Result + fn from_array_with_max( + a: &ArrayBase, + max_n_bins: usize, + ) -> Result where S: Data, { @@ -370,7 +414,11 @@ where let max = a.max()?; let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins); let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?; - Ok(Self { builder }) + if builder.n_bins() > max_n_bins { + Err(BinsBuildError::Strategy) + } else { + Ok(Self { builder }) + } } fn build(&self) -> Bins { @@ -384,7 +432,7 @@ where impl Sturges where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { /// The bin width (or bin length) according to the fitted strategy. pub fn bin_width(&self) -> T { @@ -394,14 +442,27 @@ where impl BinsBuildingStrategy for FreedmanDiaconis where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { type Elem = T; - /// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`. + /// Returns `Err(BinsBuildError::Strategy)` if improper IQR and SD are close to zero. /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`. /// Returns `Ok(Self)` otherwise. fn from_array(a: &ArrayBase) -> Result + where + S: Data, + { + Self::from_array_with_max(a, u16::MAX.into()) + } + + /// Returns `Err(BinsBuildError::Strategy)` if improper IQR and SD are close to zero. + /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`. + /// Returns `Ok(Self)` otherwise. + fn from_array_with_max( + a: &ArrayBase, + max_n_bins: usize, + ) -> Result where S: Data, { @@ -410,15 +471,48 @@ where return Err(BinsBuildError::EmptyInput); } - let mut a_copy = a.to_owned(); - let first_quartile = a_copy.quantile_mut(n64(0.25), &Nearest).unwrap(); - let third_quartile = a_copy.quantile_mut(n64(0.75), &Nearest).unwrap(); - let iqr = third_quartile - first_quartile; - - let bin_width = FreedmanDiaconis::compute_bin_width(n_points, iqr); + let n_cbrt = f64::from_usize(n_points).unwrap().powf(1. / 3.); let min = a.min()?; let max = a.max()?; + let mut a_copy = a.to_owned(); + // As there is no one-fit-all epsilon to decide whether IQR is zero, translate it into + // number of bins and compare it against `max_n_bins`. More bins than `max_n_bins` is a hint + // for an IQR close to zero. If so, deviate from proper Freedman-Diaconis rule by widening + // percentiles range and try again with `at` of 1/8, 1/16, 1/32, 1/64, 1/128, 1/256, 1/512. + let mut at = 0.5; + while at >= 1. / 512. { + at *= 0.5; + let first_quartile = a_copy.quantile_mut(n64(at), &Nearest).unwrap(); + let third_quartile = a_copy.quantile_mut(n64(1. - at), &Nearest).unwrap(); + let iqr = third_quartile - first_quartile; + let denom = T::from_f64((1. - 2. * at) * n_cbrt).unwrap(); + if denom == T::zero() { + continue; + } + let bin_width = iqr.clone() / denom; + let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?; + if builder.n_bins() > max_n_bins { + continue; + } + return Ok(Self { builder }); + } + // If the improper IQR is still close to zero, use Scott's rule as asymptotic resort before + // giving up where `m` is the mean and `s` its SD. + let m = a.iter().cloned().fold(T::zero(), |s, v| s + v) / T::from_usize(n_points).unwrap(); + let s = a + .iter() + .cloned() + .map(|v| (v.clone() - m.clone()) * (v - m.clone())) + .fold(T::zero(), |s, v| s + v); + let s = (s / T::from_usize(n_points - 1).unwrap()) + .to_f64() + .unwrap() + .sqrt(); + let bin_width = T::from_f64(3.49 * s).unwrap() / T::from_f64(n_cbrt).unwrap(); let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?; + if builder.n_bins() > max_n_bins { + return Err(BinsBuildError::Strategy); + } Ok(Self { builder }) } @@ -433,16 +527,8 @@ where impl FreedmanDiaconis where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { - fn compute_bin_width(n_bins: usize, iqr: T) -> T { - // casting `n_bins: usize` to `f64` may casus off-by-one error here if `n_bins` > 2 ^ 53, - // but it's not relevant here - #[allow(clippy::cast_precision_loss)] - let denominator = (n_bins as f64).powf(1. / 3.); - T::from_usize(2).unwrap() * iqr / T::from_f64(denominator).unwrap() - } - /// The bin width (or bin length) according to the fitted strategy. pub fn bin_width(&self) -> T { self.builder.bin_width() @@ -451,19 +537,22 @@ where impl BinsBuildingStrategy for Auto where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { type Elem = T; /// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`. /// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`. /// Returns `Ok(Self)` otherwise. - fn from_array(a: &ArrayBase) -> Result + fn from_array_with_max( + a: &ArrayBase, + max_n_bins: usize, + ) -> Result where S: Data, { - let fd_builder = FreedmanDiaconis::from_array(a); - let sturges_builder = Sturges::from_array(a); + let fd_builder = FreedmanDiaconis::from_array_with_max(a, max_n_bins); + let sturges_builder = Sturges::from_array_with_max(a, max_n_bins); match (fd_builder, sturges_builder) { (Err(_), Ok(sturges_builder)) => { let builder = SturgesOrFD::Sturges(sturges_builder); @@ -504,7 +593,7 @@ where impl Auto where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { /// The bin width (or bin length) according to the fitted strategy. pub fn bin_width(&self) -> T { @@ -524,7 +613,7 @@ where /// **Panics** if `n_bins == 0` and division by 0 panics for `T`. fn compute_bin_width(min: T, max: T, n_bins: usize) -> T where - T: Ord + Send + Clone + FromPrimitive + NumOps + Zero, + T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero, { let range = max - min; range / T::from_usize(n_bins).unwrap()