From 95906a09dc5ac777889059fac3d3cab396d4e0f5 Mon Sep 17 00:00:00 2001
From: Rouven Spreckels <rs@qu1x.dev>
Date: Fri, 21 Apr 2023 22:17:36 +0200
Subject: [PATCH] Make `FreedmanDiaconis` strategy more robust.

Use improper IQR and Scott's rule as asymptotic resort.
Introduce maximum number of bins with `u16::MAX` as default.
Compute `n_bins` arithmetically instead of iteratively.
---
 Cargo.toml                  |   2 +-
 RELEASES.md                 |   6 ++
 src/histogram/strategies.rs | 195 ++++++++++++++++++++++++++----------
 3 files changed, 149 insertions(+), 54 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 6a6e345..fe648f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ndarray-histogram"
-version = "0.1.0"
+version = "0.2.0"
 rust-version = "1.60"
 edition = "2021"
 authors = ["Rouven Spreckels <rs@qu1x.dev>"]
diff --git a/RELEASES.md b/RELEASES.md
index 2fdf1bf..3080a30 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -1,3 +1,9 @@
+# Version 0.2.0 (2023-04-21)
+
+  * Make `FreedmanDiaconis` strategy more robust. Use improper IQR and Scott's
+    rule as asymptotic resort. Introduce maximum number of bins with `u16::MAX`
+    as default. Compute `n_bins` arithmetically instead of iteratively.
+
 # Version 0.1.0 (2023-04-13)
 
   * Fix binning strategies by using [`ndarray-slice`], which as well comes with
diff --git a/src/histogram/strategies.rs b/src/histogram/strategies.rs
index 109a9fd..d0566cc 100644
--- a/src/histogram/strategies.rs
+++ b/src/histogram/strategies.rs
@@ -27,8 +27,8 @@
 //!
 //! # Notes
 //!
-//! In general, successful infererence on optimal bin width and number of bins relies on
-//! **variability** of data. In other word, the provided ovservations should not be empty or
+//! In general, successful inference on optimal bin width and number of bins relies on
+//! **variability** of data. In other word, the provided observations should not be empty or
 //! constant.
 //!
 //! In addition, [`Auto`] and [`FreedmanDiaconis`] requires the [`interquartile range (IQR)`][iqr],
@@ -52,7 +52,7 @@ use crate::{
 };
 use ndarray::{prelude::*, Data};
 use noisy_float::types::n64;
-use num_traits::{FromPrimitive, NumOps, Zero};
+use num_traits::{FromPrimitive, NumOps, ToPrimitive, Zero};
 
 /// A trait implemented by all strategies to build [`Bins`] with parameters inferred from
 /// observations.
@@ -66,16 +66,40 @@ use num_traits::{FromPrimitive, NumOps, Zero};
 pub trait BinsBuildingStrategy {
 	#[allow(missing_docs)]
 	type Elem: Ord + Send;
-	/// Returns a strategy that has learnt the required parameter fo building [`Bins`] for given
+	/// Returns a strategy that has learnt the required parameter for building [`Bins`] for given
 	/// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
 	/// with the given data and specified strategy.
 	///
+	/// Calls [`Self::from_array_with_max`] with `max_n_bins` of [`u16::MAX`].
+	///
 	/// # Errors
 	///
-	/// See each of the struct-level documentation for details on errors an implementor may return.
+	/// See each of the `struct`-level documentation for details on errors an implementation may
+	/// return.
 	///
 	/// [`Bins`]: ../struct.Bins.html
 	fn from_array<S>(array: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
+	where
+		S: Data<Elem = Self::Elem>,
+		Self: std::marker::Sized,
+	{
+		Self::from_array_with_max(array, u16::MAX.into())
+	}
+
+	/// Returns a strategy that has learnt the required parameter for building [`Bins`] for given
+	/// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
+	/// with the given data and specified strategy.
+	///
+	/// # Errors
+	///
+	/// See each of the `struct`-level documentation for details on errors an implementation may
+	/// return. Fails if the strategy requires more bins than `max_n_bins`.
+	///
+	/// [`Bins`]: ../struct.Bins.html
+	fn from_array_with_max<S>(
+		array: &ArrayBase<S, Ix1>,
+		max_n_bins: usize,
+	) -> Result<Self, BinsBuildError>
 	where
 		S: Data<Elem = Self::Elem>,
 		Self: std::marker::Sized;
@@ -157,13 +181,19 @@ pub struct Sturges<T> {
 
 /// Robust (resilient to outliers) strategy that takes into account data variability and data size.
 ///
-/// Let `n` be the number of observations.
+/// Let `n` be the number of observations and `at = 1 / 4`.
 ///
-/// `bin_width` = 2 × `IQR` × `n`<sup>−1/3</sup>
+/// `bin_width` = (1 - 2 × `at`) × `IQR` × `n`<sup>−1/3</sup>
 ///
-/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to
-/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large
-/// datasets.
+/// The bin width is proportional to the interquartile range ([`IQR`]) from `at` to `1 - at` and
+/// inversely proportional to cube root of `n`. It can be too conservative for small datasets, but
+/// it is quite good for large datasets. In case the [`IQR`] is close to zero, `at` is halved and an
+/// improper [`IQR`] is computed. This is repeated as long as `at >= 1 / 512`. If no [`IQR`] is
+/// found by then, Scott's rule is used as asymptotic resort which is based on the standard
+/// deviation (SD). If the SD is close to zero as well, this strategy fails with
+/// [`BinsBuildError::Strategy`]. As there is no one-fit-all epsilon, whether the IQR or standard
+/// deviation is close to zero is indirectly tested by requiring the computed number of bins to not
+/// exceed `max_n_bins` with a default of [`u16::MAX`].
 ///
 /// The [`IQR`] is very robust to outliers.
 ///
@@ -213,7 +243,7 @@ pub struct Auto<T> {
 
 impl<T> EquiSpaced<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	/// Returns `Err(BinsBuildError::Strategy)` if `bin_width<=0` or `min` >= `max`.
 	/// Returns `Ok(Self)` otherwise.
@@ -240,13 +270,10 @@ where
 	}
 
 	fn n_bins(&self) -> usize {
-		let mut max_edge = self.min.clone();
-		let mut n_bins = 0;
-		while max_edge <= self.max {
-			max_edge = max_edge + self.bin_width.clone();
-			n_bins += 1;
-		}
-		n_bins
+		let min = self.min.to_f64().unwrap();
+		let max = self.max.to_f64().unwrap();
+		let bin_width = self.bin_width.to_f64().unwrap();
+		usize::from_f64(((max - min) / bin_width + 0.5).ceil()).unwrap_or(usize::MAX)
 	}
 
 	fn bin_width(&self) -> T {
@@ -256,14 +283,17 @@ where
 
 impl<T> BinsBuildingStrategy for Sqrt<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	type Elem = T;
 
 	/// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
 	/// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
 	/// Returns `Ok(Self)` otherwise.
-	fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
+	fn from_array_with_max<S>(
+		a: &ArrayBase<S, Ix1>,
+		max_n_bins: usize,
+	) -> Result<Self, BinsBuildError>
 	where
 		S: Data<Elem = Self::Elem>,
 	{
@@ -278,7 +308,11 @@ where
 		let max = a.max()?;
 		let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
 		let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
-		Ok(Self { builder })
+		if builder.n_bins() > max_n_bins {
+			Err(BinsBuildError::Strategy)
+		} else {
+			Ok(Self { builder })
+		}
 	}
 
 	fn build(&self) -> Bins<T> {
@@ -292,7 +326,7 @@ where
 
 impl<T> Sqrt<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	/// The bin width (or bin length) according to the fitted strategy.
 	pub fn bin_width(&self) -> T {
@@ -302,14 +336,17 @@ where
 
 impl<T> BinsBuildingStrategy for Rice<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	type Elem = T;
 
 	/// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
 	/// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
 	/// Returns `Ok(Self)` otherwise.
-	fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
+	fn from_array_with_max<S>(
+		a: &ArrayBase<S, Ix1>,
+		max_n_bins: usize,
+	) -> Result<Self, BinsBuildError>
 	where
 		S: Data<Elem = Self::Elem>,
 	{
@@ -324,7 +361,11 @@ where
 		let max = a.max()?;
 		let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
 		let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
-		Ok(Self { builder })
+		if builder.n_bins() > max_n_bins {
+			Err(BinsBuildError::Strategy)
+		} else {
+			Ok(Self { builder })
+		}
 	}
 
 	fn build(&self) -> Bins<T> {
@@ -338,7 +379,7 @@ where
 
 impl<T> Rice<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	/// The bin width (or bin length) according to the fitted strategy.
 	pub fn bin_width(&self) -> T {
@@ -348,14 +389,17 @@ where
 
 impl<T> BinsBuildingStrategy for Sturges<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	type Elem = T;
 
 	/// Returns `Err(BinsBuildError::Strategy)` if the array is constant.
 	/// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
 	/// Returns `Ok(Self)` otherwise.
-	fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
+	fn from_array_with_max<S>(
+		a: &ArrayBase<S, Ix1>,
+		max_n_bins: usize,
+	) -> Result<Self, BinsBuildError>
 	where
 		S: Data<Elem = Self::Elem>,
 	{
@@ -370,7 +414,11 @@ where
 		let max = a.max()?;
 		let bin_width = compute_bin_width(min.clone(), max.clone(), n_bins);
 		let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
-		Ok(Self { builder })
+		if builder.n_bins() > max_n_bins {
+			Err(BinsBuildError::Strategy)
+		} else {
+			Ok(Self { builder })
+		}
 	}
 
 	fn build(&self) -> Bins<T> {
@@ -384,7 +432,7 @@ where
 
 impl<T> Sturges<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	/// The bin width (or bin length) according to the fitted strategy.
 	pub fn bin_width(&self) -> T {
@@ -394,14 +442,27 @@ where
 
 impl<T> BinsBuildingStrategy for FreedmanDiaconis<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	type Elem = T;
 
-	/// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`.
+	/// Returns `Err(BinsBuildError::Strategy)` if improper IQR and SD are close to zero.
 	/// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
 	/// Returns `Ok(Self)` otherwise.
 	fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
+	where
+		S: Data<Elem = Self::Elem>,
+	{
+		Self::from_array_with_max(a, u16::MAX.into())
+	}
+
+	/// Returns `Err(BinsBuildError::Strategy)` if improper IQR and SD are close to zero.
+	/// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
+	/// Returns `Ok(Self)` otherwise.
+	fn from_array_with_max<S>(
+		a: &ArrayBase<S, Ix1>,
+		max_n_bins: usize,
+	) -> Result<Self, BinsBuildError>
 	where
 		S: Data<Elem = Self::Elem>,
 	{
@@ -410,15 +471,48 @@ where
 			return Err(BinsBuildError::EmptyInput);
 		}
 
-		let mut a_copy = a.to_owned();
-		let first_quartile = a_copy.quantile_mut(n64(0.25), &Nearest).unwrap();
-		let third_quartile = a_copy.quantile_mut(n64(0.75), &Nearest).unwrap();
-		let iqr = third_quartile - first_quartile;
-
-		let bin_width = FreedmanDiaconis::compute_bin_width(n_points, iqr);
+		let n_cbrt = f64::from_usize(n_points).unwrap().powf(1. / 3.);
 		let min = a.min()?;
 		let max = a.max()?;
+		let mut a_copy = a.to_owned();
+		// As there is no one-fit-all epsilon to decide whether IQR is zero, translate it into
+		// number of bins and compare it against `max_n_bins`. More bins than `max_n_bins` is a hint
+		// for an IQR close to zero. If so, deviate from proper Freedman-Diaconis rule by widening
+		// percentiles range and try again with `at` of 1/8, 1/16, 1/32, 1/64, 1/128, 1/256, 1/512.
+		let mut at = 0.5;
+		while at >= 1. / 512. {
+			at *= 0.5;
+			let first_quartile = a_copy.quantile_mut(n64(at), &Nearest).unwrap();
+			let third_quartile = a_copy.quantile_mut(n64(1. - at), &Nearest).unwrap();
+			let iqr = third_quartile - first_quartile;
+			let denom = T::from_f64((1. - 2. * at) * n_cbrt).unwrap();
+			if denom == T::zero() {
+				continue;
+			}
+			let bin_width = iqr.clone() / denom;
+			let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
+			if builder.n_bins() > max_n_bins {
+				continue;
+			}
+			return Ok(Self { builder });
+		}
+		// If the improper IQR is still close to zero, use Scott's rule as asymptotic resort before
+		// giving up where `m` is the mean and `s` its SD.
+		let m = a.iter().cloned().fold(T::zero(), |s, v| s + v) / T::from_usize(n_points).unwrap();
+		let s = a
+			.iter()
+			.cloned()
+			.map(|v| (v.clone() - m.clone()) * (v - m.clone()))
+			.fold(T::zero(), |s, v| s + v);
+		let s = (s / T::from_usize(n_points - 1).unwrap())
+			.to_f64()
+			.unwrap()
+			.sqrt();
+		let bin_width = T::from_f64(3.49 * s).unwrap() / T::from_f64(n_cbrt).unwrap();
 		let builder = EquiSpaced::new(bin_width, min.clone(), max.clone())?;
+		if builder.n_bins() > max_n_bins {
+			return Err(BinsBuildError::Strategy);
+		}
 		Ok(Self { builder })
 	}
 
@@ -433,16 +527,8 @@ where
 
 impl<T> FreedmanDiaconis<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
-	fn compute_bin_width(n_bins: usize, iqr: T) -> T {
-		// casting `n_bins: usize` to `f64` may casus off-by-one error here if `n_bins` > 2 ^ 53,
-		// but it's not relevant here
-		#[allow(clippy::cast_precision_loss)]
-		let denominator = (n_bins as f64).powf(1. / 3.);
-		T::from_usize(2).unwrap() * iqr / T::from_f64(denominator).unwrap()
-	}
-
 	/// The bin width (or bin length) according to the fitted strategy.
 	pub fn bin_width(&self) -> T {
 		self.builder.bin_width()
@@ -451,19 +537,22 @@ where
 
 impl<T> BinsBuildingStrategy for Auto<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	type Elem = T;
 
 	/// Returns `Err(BinsBuildError::Strategy)` if `IQR==0`.
 	/// Returns `Err(BinsBuildError::EmptyInput)` if `a.len()==0`.
 	/// Returns `Ok(Self)` otherwise.
-	fn from_array<S>(a: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
+	fn from_array_with_max<S>(
+		a: &ArrayBase<S, Ix1>,
+		max_n_bins: usize,
+	) -> Result<Self, BinsBuildError>
 	where
 		S: Data<Elem = Self::Elem>,
 	{
-		let fd_builder = FreedmanDiaconis::from_array(a);
-		let sturges_builder = Sturges::from_array(a);
+		let fd_builder = FreedmanDiaconis::from_array_with_max(a, max_n_bins);
+		let sturges_builder = Sturges::from_array_with_max(a, max_n_bins);
 		match (fd_builder, sturges_builder) {
 			(Err(_), Ok(sturges_builder)) => {
 				let builder = SturgesOrFD::Sturges(sturges_builder);
@@ -504,7 +593,7 @@ where
 
 impl<T> Auto<T>
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	/// The bin width (or bin length) according to the fitted strategy.
 	pub fn bin_width(&self) -> T {
@@ -524,7 +613,7 @@ where
 /// **Panics** if `n_bins == 0` and division by 0 panics for `T`.
 fn compute_bin_width<T>(min: T, max: T, n_bins: usize) -> T
 where
-	T: Ord + Send + Clone + FromPrimitive + NumOps + Zero,
+	T: Ord + Send + Clone + FromPrimitive + ToPrimitive + NumOps + Zero,
 {
 	let range = max - min;
 	range / T::from_usize(n_bins).unwrap()