From 11192a22cfb7f880a17da517ae53b74e7a66bc15 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Nov 2024 21:04:49 +0100 Subject: [PATCH 1/6] Implement PixelSelector::describe() This is supposed to be similar to the describe method from scipy.stats. The following statistics are supported: - nnz - sum - min - max - mean - variance - skewness - kurtosis All statistics are computed using non-zero pixels. If you need statistics including zeros, you are better off fetching interactions as a sparse matrix with `sel.to_csr()` and computing the required statistics using the sparse matrix. The main advantage of using hictkpy's describe() instead of other methods, is that all statistics are computed (or estimated) by traversing the data only once (and without caching pixels). All statistics except variance, skewness, and kurtosis are guaranteed to be exact. Variance, skewness, and kurtosis are not exact because they are estimated using the accumulator library from Boost. However, in practice, the estimation is usually very accurate (based on my tests, the rel. error is always < 1.0e-4, and typically < 1.0e-6). The estimates can be inaccurate when the sample size is very small. For the time being, working around this issue is the useri's responsibility. Example: ```python3 f = hictk.File("test.cool") stats = ss.describe(list(p.count for p in f.fetch())) ``` Another important feature of hictkpy's describe(), is the ability of recognizing scenarios where the required statistics can be computed without traversing all pixels overlapping with the given query. For example, as soon as the first pixel with a NaN count is encountered, we can stop updating the estimates for all statistics except nnz. This means that if we do not have to compute nnz, then describe() can return as soon as the first NaN pixel is found. If we have to compute nnz, then we must traverse all pixels. However, we can still reduce the amount of work performed by describe() by taking advantage of the fact that we only need to count pixels. We recommend using describe() when computing multiple statistics at the same time. When computing a single statistic we recommend using one of nnz(), sum(), min(), max(), mean(), variance(), skewness(), or kurtosis(). All methods computing stats from a PixelSelector accept a keep_nans and keep_infs params which can be used to customize how non-finite values are handled. --- src/CMakeLists.txt | 4 + src/cooler_file_writer.cpp | 5 +- src/include/hictkpy/common.hpp | 10 + src/include/hictkpy/pixel_aggregator.hpp | 114 +++++++ src/include/hictkpy/pixel_selector.hpp | 26 +- src/pixel_aggregator_impl.hpp | 405 +++++++++++++++++++++++ src/pixel_selector.cpp | 186 ++++++++--- 7 files changed, 702 insertions(+), 48 deletions(-) create mode 100644 src/include/hictkpy/pixel_aggregator.hpp create mode 100644 src/pixel_aggregator_impl.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b31d20f..17fd4f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,8 +13,10 @@ find_package( ) find_package(Arrow REQUIRED) +find_package(Boost REQUIRED) find_package(FMT REQUIRED) find_package(nanobind REQUIRED) +find_package(phmap REQUIRED) find_package(spdlog REQUIRED) find_package(Filesystem REQUIRED) @@ -55,7 +57,9 @@ target_link_libraries( hictk::hic hictk::transformers Arrow::arrow_$,shared,static> + Boost::headers fmt::fmt-header-only + phmap spdlog::spdlog_header_only std::filesystem ) diff --git a/src/cooler_file_writer.cpp b/src/cooler_file_writer.cpp index 21d343e..f432f63 100644 --- a/src/cooler_file_writer.cpp +++ b/src/cooler_file_writer.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -89,7 +88,7 @@ void CoolerFileWriter::add_pixels(const nb::object &df) { std::visit( [&](const auto &n) { - using N = hictk::remove_cvref_t; + using N = remove_cvref_t; const auto pixels = coo_format ? coo_df_to_thin_pixels(df, true) : bg2_df_to_thin_pixels(_w->bins(), df, true); @@ -126,7 +125,7 @@ void CoolerFileWriter::finalize([[maybe_unused]] std::string_view log_lvl_str, try { std::visit( [&](const auto &num) { - using N = hictk::remove_cvref_t; + using N = remove_cvref_t; _w->aggregate(_path.string(), false, _compression_lvl, chunk_size, update_freq); }, _w->open("0").pixel_variant()); diff --git a/src/include/hictkpy/common.hpp b/src/include/hictkpy/common.hpp index 1079ce3..e8822de 100644 --- a/src/include/hictkpy/common.hpp +++ b/src/include/hictkpy/common.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -13,8 +14,17 @@ namespace hictkpy { +#define HICTKPY_LIKELY HICTK_LIKELY +#define HICTKPY_UNLIKELY HICTK_UNLIKELY + [[noreturn]] inline void unreachable_code() { hictk::unreachable_code(); } +template +using remove_cvref = hictk::remove_cvref; + +template +using remove_cvref_t = typename remove_cvref::type; + template [[nodiscard]] constexpr std::string_view map_type_to_dtype() { if constexpr (std::is_unsigned_v) { diff --git a/src/include/hictkpy/pixel_aggregator.hpp b/src/include/hictkpy/pixel_aggregator.hpp new file mode 100644 index 0000000..1054e92 --- /dev/null +++ b/src/include/hictkpy/pixel_aggregator.hpp @@ -0,0 +1,114 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hictkpy { + +struct Stats { + std::optional nnz{}; + std::optional> sum{}; + std::optional> min{}; + std::optional> max{}; + std::optional mean{}; + std::optional variance{}; + std::optional skewness{}; + std::optional kurtosis{}; +}; + +class PixelAggregator { + template + using Accumulator = boost::accumulators::accumulator_set< + N, boost::accumulators::stats< + boost::accumulators::droppable, + boost::accumulators::droppable, + boost::accumulators::droppable, + boost::accumulators::droppable, + boost::accumulators::droppable, + boost::accumulators::droppable, + boost::accumulators::droppable, + boost::accumulators::droppable>>; + + // When using ints there is a high chance that computing the 3rd or 4th moments causes an int + // overflow, leading to completely incorrect results. + // At the same time we cannot use doubles everywhere, because for large numbers the value of + // e.g. sum, mean etc. will no longer be exact + using KurtosisAccumulator = boost::accumulators::accumulator_set< + double, boost::accumulators::stats>; + + std::variant, Accumulator> _accumulator{ + Accumulator{}}; + std::optional _kurtosis_accumulator{}; + + bool _compute_count{true}; + bool _compute_sum{true}; + bool _compute_min{true}; + bool _compute_max{true}; + bool _compute_mean{true}; + bool _compute_variance{true}; + bool _compute_skewness{true}; + bool _compute_kurtosis{true}; + + bool _nan_found{false}; + bool _neg_inf_found{false}; + bool _pos_inf_found{false}; + + public: + static constexpr std::array valid_metrics{ + "nnz", "sum", "min", "max", "mean", "variance", "skewness", "kurtosis"}; + + PixelAggregator() = default; + + template + [[nodiscard]] Stats compute(const PixelSelector& sel, + const phmap::flat_hash_set& metrics); + + private: + static void validate_metrics(const phmap::flat_hash_set& metrics); + + template + void process_non_finite(N n) noexcept; + template + [[nodiscard]] It process_pixels_until_true(Accumulator& accumulator, It first, It last, + StopCondition break_loop); + + template + [[nodiscard]] bool sum_can_be_skipped(Accumulator& accumulator) const; + template + [[nodiscard]] bool min_can_be_skipped(Accumulator& accumulator) const; + template + [[nodiscard]] bool max_can_be_skipped(Accumulator& accumulator) const; + template + [[nodiscard]] bool mean_can_be_skipped(Accumulator& accumulator) const; + template + void disable_redundant_accumulators(Accumulator& accumulator); + + template + void process_all_remaining_pixels(Accumulator& accumulator, It&& first, It&& last); + + template + static bool drop_value(N n) noexcept; + + template + void reset(const phmap::flat_hash_set& metrics); + + template + [[nodiscard]] Stats extract(const phmap::flat_hash_set& metrics); +}; + +} // namespace hictkpy + +#include "../../pixel_aggregator_impl.hpp" diff --git a/src/include/hictkpy/pixel_selector.hpp b/src/include/hictkpy/pixel_selector.hpp index 890a3b2..a1c3c33 100644 --- a/src/include/hictkpy/pixel_selector.hpp +++ b/src/include/hictkpy/pixel_selector.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include "hictkpy/nanobind.hpp" @@ -55,14 +56,23 @@ struct PixelSelector { [[nodiscard]] auto get_coord2() const -> PixelCoordTuple; [[nodiscard]] nanobind::iterator make_iterable() const; - [[nodiscard]] nanobind::object to_arrow(std::string_view span = "upper_triangle") const; - [[nodiscard]] nanobind::object to_df(std::string_view span = "upper_triangle") const; - [[nodiscard]] nanobind::object to_pandas(std::string_view span = "upper_triangle") const; - [[nodiscard]] nanobind::object to_coo(std::string_view span = "upper_triangle") const; - [[nodiscard]] nanobind::object to_csr(std::string_view span = "upper_triangle") const; - [[nodiscard]] nanobind::object to_numpy(std::string_view span = "full") const; - [[nodiscard]] nanobind::object sum() const; - [[nodiscard]] std::int64_t nnz() const; + [[nodiscard]] nanobind::object to_arrow(std::string_view span) const; + [[nodiscard]] nanobind::object to_df(std::string_view span) const; + [[nodiscard]] nanobind::object to_pandas(std::string_view span) const; + [[nodiscard]] nanobind::object to_coo(std::string_view span) const; + [[nodiscard]] nanobind::object to_csr(std::string_view span) const; + [[nodiscard]] nanobind::object to_numpy(std::string_view span) const; + + [[nodiscard]] nanobind::dict describe(const std::vector& metrics, bool keep_nans, + bool keep_infs) const; + [[nodiscard]] std::int64_t nnz(bool keep_nans, bool keep_infs) const; + [[nodiscard]] nanobind::object sum(bool keep_nans, bool keep_infs) const; + [[nodiscard]] nanobind::object min(bool keep_nans = false, bool keep_infs = false) const; + [[nodiscard]] nanobind::object max(bool keep_nans = false, bool keep_infs = false) const; + [[nodiscard]] double mean(bool keep_nans = false, bool keep_infs = false) const; + [[nodiscard]] double variance(bool keep_nans = false, bool keep_infs = false) const; + [[nodiscard]] double skewness(bool keep_nans = false, bool keep_infs = false) const; + [[nodiscard]] double kurtosis(bool keep_nans = false, bool keep_infs = false) const; [[nodiscard]] static auto parse_span(std::string_view span) -> QuerySpan; [[nodiscard]] static auto parse_count_type(std::string_view type) -> PixelVar; diff --git a/src/pixel_aggregator_impl.hpp b/src/pixel_aggregator_impl.hpp new file mode 100644 index 0000000..793f5ce --- /dev/null +++ b/src/pixel_aggregator_impl.hpp @@ -0,0 +1,405 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace hictkpy { + +template +inline Stats PixelAggregator::compute(const PixelSelector& sel, + const phmap::flat_hash_set& metrics) { + static_assert(std::is_same_v || std::is_same_v); + + validate_metrics(metrics); + + reset(metrics); + auto first = sel.template begin(); + auto last = sel.template end(); + + auto break_on_non_finite = [this]() constexpr noexcept { + return _nan_found || _neg_inf_found || _pos_inf_found; + }; + auto break_on_nans = [this]() constexpr noexcept { return _nan_found; }; + + std::visit( + [&](auto& accumulator) { + if (_compute_kurtosis) { + constexpr bool skip_kurtosis = false; + first = process_pixels_until_true( + accumulator, std::move(first), last, break_on_non_finite); + } else { + constexpr bool skip_kurtosis = true; + first = process_pixels_until_true( + accumulator, std::move(first), last, break_on_non_finite); + } + if (first == last) { + return; + } + + const auto skip_sum = sum_can_be_skipped(accumulator); + const auto skip_min = min_can_be_skipped(accumulator); + const auto skip_max = max_can_be_skipped(accumulator); + const auto skip_mean = mean_can_be_skipped(accumulator); + disable_redundant_accumulators(accumulator); + + if (_compute_count) { + // if we need to compute the nnz, then we have no tricks up our sleeves + process_all_remaining_pixels(accumulator, std::move(first), + std::move(last)); + return; + } + + if (skip_sum && skip_min && skip_max && skip_mean) { + return; + } + + constexpr bool skip_kurtosis = true; + std::ignore = process_pixels_until_true( + accumulator, std::move(first), std::move(last), break_on_nans); + }, + _accumulator); + + return extract(metrics); +} + +inline void PixelAggregator::validate_metrics(const phmap::flat_hash_set& metrics) { + for (const auto& metric : metrics) { + if (std::find(valid_metrics.begin(), valid_metrics.end(), metric) == valid_metrics.end()) { + throw std::out_of_range( + fmt::format(FMT_STRING("unknown metric \"{}\". Valid metrics are: {}"), metric, + fmt::join(valid_metrics, ", "))); + } + } +} + +template +inline void PixelAggregator::process_non_finite(N n) noexcept { + static_assert(std::is_arithmetic_v); + if constexpr (std::is_integral_v) { + return; + } + + if constexpr (keep_nans) { + if (HICTKPY_UNLIKELY(std::isnan(n))) { + _nan_found = true; + return; + } + } + + if constexpr (keep_infs) { + if (HICTKPY_UNLIKELY(std::isinf(n))) { + if (n > 0) { + _pos_inf_found = true; + } else { + _neg_inf_found = true; + } + } + } +} + +template +inline It PixelAggregator::process_pixels_until_true(Accumulator& accumulator, It first, It last, + StopCondition break_loop) { + // This is just a workaround to allow wrapping drop_value and early_return with HICTKPY_UNLIKELY + auto drop_pixel = [this](const auto n) noexcept { return drop_value(n); }; + + while (first != last) { + const auto n = first->count; + if (HICTKPY_UNLIKELY(drop_pixel(n))) { + std::ignore = ++first; + continue; + } + process_non_finite(n); + if (HICTKPY_UNLIKELY(break_loop())) { + break; + } + accumulator(n); + + if constexpr (!skip_kurtosis && std::is_integral_v) { + assert(_kurtosis_accumulator.has_value()); + (*_kurtosis_accumulator)(static_cast(n)); + } + std::ignore = ++first; + } + return first; +} + +template +inline bool PixelAggregator::sum_can_be_skipped(Accumulator& accumulator) const { + if (_compute_sum && (_nan_found || (_neg_inf_found && _pos_inf_found))) { + accumulator.template drop(); + return true; + } + return false; +} + +template +inline bool PixelAggregator::min_can_be_skipped(Accumulator& accumulator) const { + if (_compute_min && _nan_found) { + accumulator.template drop(); + return true; + } + return false; +} + +template +inline bool PixelAggregator::max_can_be_skipped(Accumulator& accumulator) const { + if (_compute_max && _nan_found) { + accumulator.template drop(); + return true; + } + return false; +} + +template +inline bool PixelAggregator::mean_can_be_skipped(Accumulator& accumulator) const { + if (_compute_mean && (_nan_found || (_neg_inf_found && _pos_inf_found))) { + accumulator.template drop(); + return true; + } + return false; +} + +template +inline void PixelAggregator::disable_redundant_accumulators(Accumulator& accumulator) { + if (_compute_variance) { + accumulator.template drop(); + } + if (_compute_skewness) { + accumulator.template drop(); + } + if (_compute_kurtosis) { + accumulator.template drop(); + _kurtosis_accumulator.reset(); + } +} + +template +inline void PixelAggregator::process_all_remaining_pixels(Accumulator& accumulator, It&& first, + It&& last) { + assert(_compute_count); + // This is just a workaround to allow wrapping drop_value and early_return with HICTKPY_UNLIKELY + auto drop_pixel = [this](const auto n) noexcept { return drop_value(n); }; + + std::for_each(std::forward(first), std::forward(last), [&](const auto& pixel) { + if (HICTKPY_UNLIKELY(drop_pixel(pixel.count))) { + return; + } + process_non_finite(pixel.count); + accumulator(pixel.count); + }); +} + +template +inline bool PixelAggregator::drop_value([[maybe_unused]] N n) noexcept { + static_assert(std::is_arithmetic_v); + if constexpr (!std::is_floating_point_v) { + return false; + } + + if constexpr (!keep_nans && !keep_infs) { + return !std::isfinite(n); + } + if constexpr (!keep_nans) { + return std::isnan(n); + } + if constexpr (!keep_infs) { + return std::isinf(n); + } + + return false; +} + +template +inline void PixelAggregator::reset(const phmap::flat_hash_set& metrics) { + if (metrics.empty()) { + throw std::runtime_error("provide one or more statistics to be computed"); + } + + _compute_count = true; + _compute_sum = true; + _compute_min = true; + _compute_max = true; + _compute_mean = true; + _compute_variance = true; + _compute_skewness = true; + _compute_kurtosis = true; + + _nan_found = false; + _neg_inf_found = false; + _pos_inf_found = false; + + if constexpr (std::is_floating_point_v) { + _accumulator = Accumulator{}; + _kurtosis_accumulator.reset(); + } else { + _accumulator = Accumulator{}; + if (metrics.contains("kurtosis")) { + _kurtosis_accumulator = KurtosisAccumulator{}; + } else { + _kurtosis_accumulator.reset(); + _compute_kurtosis = false; + } + } + + std::visit( + [&](auto& accumulator) { + if (!metrics.contains("nnz")) { + accumulator.template drop(); + _compute_count = false; + } + + if (!metrics.contains("sum")) { + accumulator.template drop(); + _compute_sum = false; + } + + if (!metrics.contains("min")) { + accumulator.template drop(); + _compute_min = false; + } + + if (!metrics.contains("max")) { + accumulator.template drop(); + _compute_max = false; + } + + if (!metrics.contains("mean")) { + accumulator.template drop(); + _compute_mean = false; + } + + if (!metrics.contains("variance")) { + accumulator.template drop(); + _compute_variance = false; + } + + if (!metrics.contains("skewness")) { + accumulator.template drop(); + _compute_skewness = false; + } + + if (!metrics.contains("kurtosis") || std::is_integral_v) { + accumulator.template drop(); + _compute_kurtosis = metrics.contains("kurtosis"); + } + }, + _accumulator); +} + +template +inline Stats PixelAggregator::extract(const phmap::flat_hash_set& metrics) { + assert(!metrics.empty()); + + Stats stats{}; + + std::visit( + [&](auto& accumulator) { + if (metrics.contains("nnz")) { + stats.nnz = boost::accumulators::count(accumulator); + } + + if (metrics.contains("sum")) { + if constexpr (std::is_floating_point_v) { + if (_nan_found || (_neg_inf_found && _pos_inf_found)) { + stats.sum = std::numeric_limits::quiet_NaN(); + } else if (_pos_inf_found) { + stats.sum = std::numeric_limits::infinity(); + } else if (_neg_inf_found) { + stats.sum = -std::numeric_limits::infinity(); + } + } + if (!stats.sum.has_value()) { + stats.sum = boost::accumulators::sum(accumulator); + } + } + + if (metrics.contains("min")) { + if constexpr (std::is_floating_point_v) { + if (_nan_found) { + stats.min = std::numeric_limits::quiet_NaN(); + } else if (_neg_inf_found) { + stats.min = -std::numeric_limits::infinity(); + } + } + if (!stats.min.has_value()) { + stats.min = boost::accumulators::min(accumulator); + } + } + + if (metrics.contains("max")) { + if constexpr (std::is_floating_point_v) { + if (_nan_found) { + stats.max = std::numeric_limits::quiet_NaN(); + } else if (_pos_inf_found) { + stats.max = std::numeric_limits::infinity(); + } + } + if (!stats.max.has_value()) { + stats.max = boost::accumulators::max(accumulator); + } + } + + if (metrics.contains("mean")) { + if (_nan_found || (_neg_inf_found && _pos_inf_found)) { + stats.mean = std::numeric_limits::quiet_NaN(); + } else if (_pos_inf_found) { + stats.mean = std::numeric_limits::infinity(); + } else if (_neg_inf_found) { + stats.mean = -std::numeric_limits::infinity(); + } else { + stats.mean = boost::accumulators::mean(accumulator); + } + } + + if (metrics.contains("variance")) { + if (_nan_found || _pos_inf_found || _neg_inf_found) { + stats.variance = std::numeric_limits::quiet_NaN(); + } else { + stats.variance = boost::accumulators::variance(accumulator); + } + } + + if (metrics.contains("skewness")) { + if (_nan_found || _pos_inf_found || _neg_inf_found) { + stats.skewness = std::numeric_limits::quiet_NaN(); + } else { + stats.skewness = boost::accumulators::skewness(accumulator); + } + } + + if (metrics.contains("kurtosis")) { + if (_nan_found || _pos_inf_found || _neg_inf_found) { + stats.kurtosis = std::numeric_limits::quiet_NaN(); + } else if (_kurtosis_accumulator.has_value()) { + stats.kurtosis = boost::accumulators::kurtosis(*_kurtosis_accumulator); + } else { + stats.kurtosis = boost::accumulators::kurtosis(accumulator); + } + } + }, + _accumulator); + + return stats; +} + +}; // namespace hictkpy diff --git a/src/pixel_selector.cpp b/src/pixel_selector.cpp index bc8518b..f7d6769 100644 --- a/src/pixel_selector.cpp +++ b/src/pixel_selector.cpp @@ -9,6 +9,8 @@ #include #include +#include +#include #include #include @@ -17,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -32,8 +34,11 @@ #include #include #include +#include +#include "hictkpy/common.hpp" #include "hictkpy/nanobind.hpp" +#include "hictkpy/pixel_aggregator.hpp" #include "hictkpy/pixel_selector.hpp" #include "hictkpy/to_pyarrow.hpp" @@ -273,52 +278,125 @@ nb::object PixelSelector::to_numpy(std::string_view span) const { } template -[[nodiscard]] static nb::object compute_pixel_sum(const PixelSelector& sel) { - return nb::cast(std::accumulate( - sel.template begin(), sel.template end(), N{0}, - [](N accumulator, const hictk::ThinPixel& tp) { return accumulator + tp.count; })); +[[nodiscard]] static Stats aggregate_pixels(const PixelSelector& sel, bool keep_nans, + bool keep_infs, + const phmap::flat_hash_set& metrics) { + static_assert(!std::is_same_v); + if (keep_nans && keep_infs) { + return PixelAggregator{}.compute(sel, metrics); + } + if (keep_nans) { + return PixelAggregator{}.compute(sel, metrics); + } + if (keep_infs) { + return PixelAggregator{}.compute(sel, metrics); + } + return PixelAggregator{}.compute(sel, metrics); } -nb::object PixelSelector::sum() const { - const bool fp_sum = std::visit( - [&]([[maybe_unused]] const auto count) { return std::is_floating_point_v; }, - pixel_count); - - const bool unsigned_sum = std::visit( - [&]([[maybe_unused]] const auto count) { return std::is_unsigned_v; }, - pixel_count); - +[[nodiscard]] static Stats aggregate_pixels(const PixelSelector::SelectorVar& sel, + const PixelSelector::PixelVar& count, bool keep_nans, + bool keep_infs, + const phmap::flat_hash_set& metrics) { return std::visit( - [&](const auto& sel_ptr) -> nb::object { - if (fp_sum) { - return compute_pixel_sum(*sel_ptr); - } - if (unsigned_sum) { - return compute_pixel_sum(*sel_ptr); - } - return compute_pixel_sum(*sel_ptr); + [&](const auto& sel_ptr) { + assert(sel_ptr); + return std::visit( + [&]([[maybe_unused]] const auto& count_) { + using CountT = remove_cvref_t; + using N = std::conditional_t, double, std::int64_t>; + return aggregate_pixels(*sel_ptr, keep_nans, keep_infs, metrics); + }, + count); }, - selector); + sel); } -std::int64_t PixelSelector::nnz() const { - return std::visit( - [&](const auto& s) { - using T = std::int_fast8_t; - return std::distance(s->template begin(), s->template end()); - }, - selector); +nb::dict PixelSelector::describe(const std::vector& metrics, bool keep_nans, + bool keep_infs) const { + const auto stats = aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, + {metrics.begin(), metrics.end()}); + + using StatsDict = + nanobind::typed>; + + StatsDict stats_py{}; + + if (stats.nnz) { + stats_py["nnz"] = *stats.nnz; + } + if (stats.sum) { + stats_py["sum"] = + std::visit([](const auto n) -> nb::object { return nb::cast(n); }, *stats.sum); + } + if (stats.min) { + stats_py["min"] = + std::visit([](const auto n) -> nb::object { return nb::cast(n); }, *stats.min); + } + if (stats.max) { + stats_py["max"] = + std::visit([](const auto n) -> nb::object { return nb::cast(n); }, *stats.max); + } + if (stats.mean) { + stats_py["mean"] = *stats.mean; + } + if (stats.variance) { + stats_py["variance"] = *stats.variance; + } + if (stats.skewness) { + stats_py["skewness"] = *stats.skewness; + } + if (stats.kurtosis) { + stats_py["kurtosis"] = *stats.kurtosis; + } + + return stats_py; +} + +std::int64_t PixelSelector::nnz(bool keep_nans, bool keep_infs) const { + return *aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"nnz"}).nnz; +} + +nb::object PixelSelector::sum(bool keep_nans, bool keep_infs) const { + const auto stats = aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"sum"}); + return std::visit([](const auto n) -> nb::object { return nb::cast(n); }, *stats.sum); +} + +nb::object PixelSelector::min(bool keep_nans, bool keep_infs) const { + const auto stats = aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"min"}); + return std::visit([](const auto n) -> nb::object { return nb::cast(n); }, *stats.min); +} + +nb::object PixelSelector::max(bool keep_nans, bool keep_infs) const { + const auto stats = aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"max"}); + return std::visit([](const auto n) -> nb::object { return nb::cast(n); }, *stats.max); } -hictk::transformers::QuerySpan PixelSelector::parse_span(std::string_view span) { +double PixelSelector::mean(bool keep_nans, bool keep_infs) const { + return *aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"mean"}).mean; +} + +double PixelSelector::variance(bool keep_nans, bool keep_infs) const { + return *aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"variance"}).variance; +} + +double PixelSelector::skewness(bool keep_nans, bool keep_infs) const { + return *aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"skewness"}).skewness; +} + +double PixelSelector::kurtosis(bool keep_nans, bool keep_infs) const { + return *aggregate_pixels(selector, pixel_count, keep_nans, keep_infs, {"kurtosis"}).kurtosis; +} + +auto PixelSelector::parse_span(std::string_view span) -> QuerySpan { if (span == "upper_triangle") { - return hictk::transformers::QuerySpan::upper_triangle; + return QuerySpan::upper_triangle; } if (span == "lower_triangle") { - return hictk::transformers::QuerySpan::lower_triangle; + return QuerySpan::lower_triangle; } if (span == "full") { - return hictk::transformers::QuerySpan::full; + return QuerySpan::full; } throw std::runtime_error( @@ -327,7 +405,7 @@ hictk::transformers::QuerySpan PixelSelector::parse_span(std::string_view span) span)); } -hictk::internal::NumericVariant PixelSelector::parse_count_type(std::string_view type) { +auto PixelSelector::parse_count_type(std::string_view type) -> PixelVar { return map_dtype_to_type(type); } @@ -415,10 +493,44 @@ void PixelSelector::bind(nb::module_& m) { nb::sig("def to_csr(self, query_span: str = \"upper_triangle\") -> scipy.sparse.csr_matrix"), "Retrieve interactions as a SciPy CSR matrix.", nb::rv_policy::move); - sel.def("nnz", &PixelSelector::nnz, + static const std::vector known_metrics(PixelAggregator::valid_metrics.begin(), + PixelAggregator::valid_metrics.end()); + static const auto describe_cmd_help = + fmt::format(FMT_STRING("Compute one or more descriptive metrics in the most efficient " + "way possible. Known metrics: {}."), + fmt::join(known_metrics, ", ")); + sel.def("describe", &PixelSelector::describe, nb::arg("metrics") = known_metrics, + nb::arg("keep_nans") = false, nb::arg("keep_infs") = false, describe_cmd_help.c_str()); + sel.def("nnz", &PixelSelector::nnz, nb::arg("keep_nans") = false, nb::arg("keep_infs") = false, "Get the number of non-zero entries for the current pixel selection."); - sel.def("sum", &PixelSelector::sum, nb::sig("def sum(self) -> int | float"), + sel.def("sum", &PixelSelector::sum, nb::arg("keep_nans") = false, nb::arg("keep_infs") = false, + nb::sig("def sum(self, keep_nans: bool = False, keep_infs: bool = False) -> int | float"), "Get the total number of interactions for the current pixel selection."); + sel.def("min", &PixelSelector::min, nb::arg("keep_nans") = false, nb::arg("keep_infs") = false, + nb::sig("def min(self, keep_nans: bool = False, keep_infs: bool = False) -> int | float"), + "Get the minimum number of interactions for the current pixel selection (excluding " + "pixels with no interactions)."); + sel.def("max", &PixelSelector::max, nb::arg("keep_nans") = false, nb::arg("keep_infs") = false, + nb::sig("def max(self, keep_nans: bool = False, keep_infs: bool = False) -> int | float"), + "Get the maximum number of interactions for the current pixel selection."); + sel.def("mean", &PixelSelector::mean, nb::arg("keep_nans") = false, nb::arg("keep_infs") = false, + "Get the average number of interactions for the current pixel selection (excluding " + "pixels with no interactions."); + sel.def( + "variance", &PixelSelector::variance, nb::arg("keep_nans") = false, + nb::arg("keep_infs") = false, + "Get the variance of the number of interactions for the current pixel selection (excluding " + "pixels with no interactions)."); + sel.def( + "skewness", &PixelSelector::skewness, nb::arg("keep_nans") = false, + nb::arg("keep_infs") = false, + "Get the skewness of the number of interactions for the current pixel selection (excluding " + "pixels with no interactions)."); + sel.def( + "kurtosis", &PixelSelector::kurtosis, nb::arg("keep_nans") = false, + nb::arg("keep_infs") = false, + "Get the kurtosis of the number of interactions for the current pixel selection (excluding " + "pixels with no interactions)."); } } // namespace hictkpy From 3a77a2bf9e331407f1b6f2b22472912f346e3be9 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Nov 2024 21:39:25 +0100 Subject: [PATCH 2/6] Add tests --- test/test_fetch_describe.py | 261 ++++++++++++++++++++++++++++++++++++ test/test_fetch_kurtosis.py | 85 ++++++++++++ test/test_fetch_max.py | 88 ++++++++++++ test/test_fetch_mean.py | 77 +++++++++++ test/test_fetch_min.py | 90 +++++++++++++ test/test_fetch_nnz.py | 49 ++++++- test/test_fetch_skewness.py | 85 ++++++++++++ test/test_fetch_sum.py | 54 +++++++- test/test_fetch_variance.py | 84 ++++++++++++ 9 files changed, 870 insertions(+), 3 deletions(-) create mode 100644 test/test_fetch_describe.py create mode 100644 test/test_fetch_kurtosis.py create mode 100644 test/test_fetch_max.py create mode 100644 test/test_fetch_mean.py create mode 100644 test/test_fetch_min.py create mode 100644 test/test_fetch_skewness.py create mode 100644 test/test_fetch_variance.py diff --git a/test/test_fetch_describe.py b/test/test_fetch_describe.py new file mode 100644 index 0000000..6666420 --- /dev/null +++ b/test/test_fetch_describe.py @@ -0,0 +1,261 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +import tempfile +from math import inf, isnan + +import pytest + +import hictkpy + +from .helpers import numpy_avail, pandas_avail + + +def isclose(n1, n2) -> bool: + import math + + return math.isclose(n1, n2, rel_tol=1.0e-3, abs_tol=1.0e-3) + + +@pytest.mark.skipif(not numpy_avail() or not pandas_avail(), reason="either numpy or pandas are not available") +class TestClass: + @staticmethod + def generate_pixels(insert_nan: bool = False, insert_neg_inf: bool = False, insert_pos_inf: bool = False): + import numpy as np + import pandas as pd + + chroms = {"chr1": 1000} + resolution = 10 + num_bins = chroms["chr1"] // resolution + + bins = list(range(0, num_bins)) + + bin1_ids = [] + bin2_ids = [] + + # generate coordinates + for i in bins: + bin1_ids.extend([i] * len(bins)) + bin2_ids.extend(bins) + + bin1_ids = np.array(bin1_ids) + bin2_ids = np.array(bin2_ids) + + # drop coordinates overlapping with the lower-triangular matrix + mask = bin1_ids <= bin1_ids + bin1_ids = bin1_ids[mask] + bin2_ids = bin2_ids[mask] + + # make matrix sparse + bin1_ids = bin1_ids[::2] + bin2_ids = bin2_ids[::2] + + counts = [i + 0.123 for i, _ in enumerate(bin1_ids)] + + if insert_nan: + counts[-1] = np.nan + if insert_neg_inf: + counts[-2] = -np.inf + if insert_pos_inf: + counts[-3] = np.inf + + return chroms, resolution, pd.DataFrame({"bin1_id": bin1_ids, "bin2_id": bin2_ids, "count": counts}) + + @staticmethod + def make_cooler_file(chroms, resolution, pixels, tmpdir) -> hictkpy.File: + with tempfile.NamedTemporaryFile(dir=tmpdir, suffix=".cool") as tmpfile: + path = pathlib.Path(tmpfile.name) + + w = hictkpy.cooler.FileWriter(path, chroms, resolution) + w.add_pixels(pixels) + w.finalize() + + return hictkpy.File(path, resolution) + + def test_describe_all_finite(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=False, insert_pos_inf=False), tmpdir + ) + + stats = f.fetch(count_type="float").describe(keep_nans=True, keep_infs=True) + + assert stats.get("nnz", -1) == 5_000 + assert isclose(stats.get("sum", -1), 12498115.0) + assert isclose(stats.get("min", -1), 0.123) + assert isclose(stats.get("max", -1), 4999.123) + assert isclose(stats.get("mean", -1), 2499.623) + assert isclose(stats.get("variance", -1), 2083750.0) + assert isclose(stats.get("skewness", -1), -2.598076367237896e-16) + assert isclose(stats.get("kurtosis", -1), -1.2000000960000043) + + def test_describe_subset_finite(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=False, insert_pos_inf=False), tmpdir + ) + + stats = f.fetch(count_type="float").describe(["nnz", "kurtosis"]) + assert len(stats) == 2 + assert stats.get("nnz", -1) == 5_000 + assert isclose(stats.get("kurtosis", -1), -1.2000000960000043) + + stats = f.fetch(count_type="float").describe(["max", "variance"]) + assert len(stats) == 2 + assert isclose(stats.get("max", -1), 4999.123) + assert isclose(stats.get("variance", -1), 2083750.0) + + def test_describe_all_with_nans(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=True, insert_neg_inf=False, insert_pos_inf=False), tmpdir + ) + + stats = f.fetch(count_type="float").describe(keep_nans=True, keep_infs=True) + + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("sum", -1)) + assert isnan(stats.get("min", -1)) + assert isnan(stats.get("max", -1)) + assert isnan(stats.get("mean", -1)) + assert isnan(stats.get("variance", -1)) + assert isnan(stats.get("skewness", -1)) + assert isnan(stats.get("kurtosis", -1)) + + def test_describe_subset_with_nans(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=True, insert_neg_inf=False, insert_pos_inf=False), tmpdir + ) + + stats = f.fetch(count_type="float").describe(["nnz", "kurtosis"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("kurtosis", -1)) + + stats = f.fetch(count_type="float").describe(["max", "variance"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert isnan(stats.get("max", -1)) + assert isnan(stats.get("variance", -1)) + + def test_describe_all_with_neg_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=True, insert_pos_inf=False), tmpdir + ) + + stats = f.fetch(count_type="float").describe(keep_nans=True, keep_infs=True) + + assert stats.get("nnz", -1) == 5_000 + assert stats.get("sum", -1) == -inf + assert stats.get("min", -1) == -inf + assert isclose(stats.get("max", -1), 4999.123) + assert stats.get("mean", -1) == -inf + assert isnan(stats.get("variance", -1)) + assert isnan(stats.get("skewness", -1)) + assert isnan(stats.get("kurtosis", -1)) + + def test_describe_subset_with_neg_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=True, insert_pos_inf=False), tmpdir + ) + + stats = f.fetch(count_type="float").describe(["nnz", "kurtosis"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("kurtosis", -1)) + + stats = f.fetch(count_type="float").describe(["max", "variance"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert isclose(stats.get("max", -1), 4999.123) + assert isnan(stats.get("variance", -1)) + + def test_describe_all_with_pos_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=False, insert_pos_inf=True), tmpdir + ) + + stats = f.fetch(count_type="float").describe(keep_nans=True, keep_infs=True) + + assert stats.get("nnz", -1) == 5_000 + assert stats.get("sum", -1) == inf + assert isclose(stats.get("min", -1), 0.123) + assert stats.get("max", -1) == inf + assert stats.get("mean", -1) == inf + assert isnan(stats.get("variance", -1)) + assert isnan(stats.get("skewness", -1)) + assert isnan(stats.get("kurtosis", -1)) + + def test_describe_subset_with_pos_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=False, insert_pos_inf=True), tmpdir + ) + + stats = f.fetch(count_type="float").describe(["nnz", "kurtosis"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("kurtosis", -1)) + + stats = f.fetch(count_type="float").describe(["max", "variance"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("max", -1) == inf + assert isnan(stats.get("variance", -1)) + + def test_describe_all_with_neg_and_pos_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=True, insert_pos_inf=True), tmpdir + ) + + stats = f.fetch(count_type="float").describe(keep_nans=True, keep_infs=True) + + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("sum", -1)) + assert stats.get("min", -1) == -inf + assert stats.get("max", -1) == inf + assert isnan(stats.get("mean", -1)) + assert isnan(stats.get("variance", -1)) + assert isnan(stats.get("skewness", -1)) + assert isnan(stats.get("kurtosis", -1)) + + def test_describe_subset_with_neg_and_pos_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=False, insert_neg_inf=True, insert_pos_inf=True), tmpdir + ) + + stats = f.fetch(count_type="float").describe(["nnz", "kurtosis"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("kurtosis", -1)) + + stats = f.fetch(count_type="float").describe(["max", "variance"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("max", -1) == inf + assert isnan(stats.get("variance", -1)) + + def test_describe_all_with_nan_and_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=True, insert_neg_inf=False, insert_pos_inf=True), tmpdir + ) + + stats = f.fetch(count_type="float").describe(keep_nans=True, keep_infs=True) + + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("sum", -1)) + assert isnan(stats.get("min", -1)) + assert isnan(stats.get("max", -1)) + assert isnan(stats.get("mean", -1)) + assert isnan(stats.get("variance", -1)) + assert isnan(stats.get("skewness", -1)) + assert isnan(stats.get("kurtosis", -1)) + + def test_describe_subset_with_nan_and_inf(self, tmpdir): + f = self.make_cooler_file( + *self.generate_pixels(insert_nan=True, insert_neg_inf=False, insert_pos_inf=True), tmpdir + ) + + stats = f.fetch(count_type="float").describe(["nnz", "kurtosis"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert stats.get("nnz", -1) == 5_000 + assert isnan(stats.get("kurtosis", -1)) + + stats = f.fetch(count_type="float").describe(["max", "variance"], keep_nans=True, keep_infs=True) + assert len(stats) == 2 + assert isnan(stats.get("max", -1)) + assert isnan(stats.get("variance", -1)) diff --git a/test/test_fetch_kurtosis.py b/test/test_fetch_kurtosis.py new file mode 100644 index 0000000..9413faf --- /dev/null +++ b/test/test_fetch_kurtosis.py @@ -0,0 +1,85 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +from math import isnan + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_fetch_kurtosis(self, file, resolution): + f = hictkpy.File(file, resolution) + + def isclose(n1, n2) -> bool: + import math + + return math.isclose(n1, n2, rel_tol=1.0e-4) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics kurtosis --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight + assert isclose(f.fetch().kurtosis(), 9102.714436703003) + assert isclose(f.fetch().kurtosis(keep_infs=True), 9102.714436703003) + assert isclose(f.fetch().kurtosis(keep_nans=True), 9102.714436703003) + assert isclose(f.fetch().kurtosis(keep_nans=True, keep_infs=True), 9102.714436703003) + + assert isclose(f.fetch("chr2R").kurtosis(), 373.3395734325773) + assert isclose(f.fetch("chr2R").kurtosis(keep_infs=True), 373.3395734325773) + assert isclose(f.fetch("chr2R").kurtosis(keep_nans=True), 373.3395734325773) + assert isclose(f.fetch("chr2R").kurtosis(keep_nans=True, keep_infs=True), 373.3395734325773) + + assert isclose(f.fetch("chr2L", "chr2R").kurtosis(), 3688.676195306675) + assert isclose(f.fetch("chr2L", "chr2R").kurtosis(keep_infs=True), 3688.676195306675) + assert isclose(f.fetch("chr2L", "chr2R").kurtosis(keep_nans=True), 3688.676195306675) + assert isclose(f.fetch("chr2L", "chr2R").kurtosis(keep_nans=True, keep_infs=True), 3688.676195306675) + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").kurtosis(), 231607.900103242) + assert isclose(f.fetch(normalization="VC").kurtosis(keep_nans=True), 231607.900103242) + assert isnan(f.fetch(normalization="VC").kurtosis(keep_infs=True)) + assert isnan(f.fetch(normalization="VC").kurtosis(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization="VC").kurtosis(), 969.1654059364474) + assert isclose(f.fetch("chr2R", normalization="VC").kurtosis(keep_infs=True), 969.1654059364474) + assert isclose(f.fetch("chr2R", normalization="VC").kurtosis(keep_nans=True), 969.1654059364474) + assert isclose( + f.fetch("chr2R", normalization="VC").kurtosis(keep_nans=True, keep_infs=True), 969.1654059364474 + ) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").kurtosis(), 12761.80816176265) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").kurtosis(keep_infs=True), 12761.80816176265) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").kurtosis(keep_nans=True), 12761.80816176265) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").kurtosis(keep_nans=True, keep_infs=True), + 12761.80816176265, + ) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose(f.fetch(normalization=norm).kurtosis(), 9402.779596517719) + assert isclose(f.fetch(normalization=norm).kurtosis(keep_infs=True), 9402.779596517719) + assert isnan(f.fetch(normalization=norm).kurtosis(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).kurtosis(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization=norm).kurtosis(), 129.2767572383961) + assert isclose(f.fetch("chr2R", normalization=norm).kurtosis(keep_infs=True), 129.2767572383961) + assert isnan(f.fetch("chr2R", normalization=norm).kurtosis(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).kurtosis(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).kurtosis(), 577.4015771468706) + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).kurtosis(keep_infs=True), 577.4015771468706) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).kurtosis(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).kurtosis(keep_nans=True, keep_infs=True)) diff --git a/test/test_fetch_max.py b/test/test_fetch_max.py new file mode 100644 index 0000000..cf651c0 --- /dev/null +++ b/test/test_fetch_max.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +from math import isclose, isinf, isnan + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_fetch_max(self, file, resolution): + f = hictkpy.File(file, resolution) + + assert f.fetch().max() == 660_210 + assert f.fetch().max(keep_infs=True) == 660_210 + assert f.fetch().max(keep_nans=True) == 660_210 + assert f.fetch().max(keep_nans=True, keep_infs=True) == 660_210 + + assert f.fetch("chr2R").max() == 229_239 + assert f.fetch("chr2R").max(keep_infs=True) == 229_239 + assert f.fetch("chr2R").max(keep_nans=True) == 229_239 + assert f.fetch("chr2R").max(keep_nans=True, keep_infs=True) == 229_239 + + assert f.fetch("chr2L", "chr2R").max() == 3_051 + assert f.fetch("chr2L", "chr2R").max(keep_infs=True) == 3_051 + assert f.fetch("chr2L", "chr2R").max(keep_nans=True) == 3_051 + assert f.fetch("chr2L", "chr2R").max(keep_nans=True, keep_infs=True) == 3_051 + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").max(), 2929440.776073241) + assert isclose(f.fetch(normalization="VC").max(keep_nans=True), 2929440.776073241) + assert isinf(f.fetch(normalization="VC").max(keep_infs=True)) + assert isinf(f.fetch(normalization="VC").max(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization="VC").max(), 322150.9086956783) + assert isclose(f.fetch("chr2R", normalization="VC").max(keep_infs=True), 322150.9086956783) + assert isclose(f.fetch("chr2R", normalization="VC").max(keep_nans=True), 322150.9086956783) + assert isclose(f.fetch("chr2R", normalization="VC").max(keep_nans=True, keep_infs=True), 322150.9086956783) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").max(), 20736.04261680762) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").max(keep_infs=True), 20736.04261680762) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").max(keep_nans=True), 20736.04261680762) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").max(keep_nans=True, keep_infs=True), 20736.04261680762 + ) + + # Even though the test .mcool and .hic files are supposed to be identical, normalized interactions are expected to be very close, but not identical. + # This is due to differences in how balancing weights are stored Cooler and .hic files: + # - Cooler uses float64 multiplicative weights + # - .hic v8 and older use float64 divisive weights + # - .hic v9 uses float32 divisive weights + # Usually the differences are small enough to not being picked up by isclose. + # However, this is not the case for very small or very large values (like those returned by min() and max()) + def isclose_(n1, n2) -> bool: + import math + + return math.isclose(n1, n2, rel_tol=1.0e-6) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose_(f.fetch(normalization=norm).max(), 9.307500711565524) + assert isclose_(f.fetch(normalization=norm).max(keep_infs=True), 9.307500711565524) + assert isnan(f.fetch(normalization=norm).max(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).max(keep_nans=True, keep_infs=True)) + + assert isclose_(f.fetch("chr2R", normalization=norm).max(), 1.386381144237653) + assert isclose_(f.fetch("chr2R", normalization=norm).max(keep_infs=True), 1.386381144237653) + assert isnan(f.fetch("chr2R", normalization=norm).max(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).max(keep_nans=True, keep_infs=True)) + + assert isclose_(f.fetch("chr2L", "chr2R", normalization=norm).max(), 0.02017488661930269) + assert isclose_(f.fetch("chr2L", "chr2R", normalization=norm).max(keep_infs=True), 0.02017488661930269) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).max(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).max(keep_nans=True, keep_infs=True)) diff --git a/test/test_fetch_mean.py b/test/test_fetch_mean.py new file mode 100644 index 0000000..6c6d489 --- /dev/null +++ b/test/test_fetch_mean.py @@ -0,0 +1,77 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +from math import isclose, isinf, isnan + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_fetch_mean(self, file, resolution): + f = hictkpy.File(file, resolution) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics mean --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight + assert isclose(f.fetch().mean(), 133.8844959028913) + assert isclose(f.fetch().mean(keep_infs=True), 133.8844959028913) + assert isclose(f.fetch().mean(keep_nans=True), 133.8844959028913) + assert isclose(f.fetch().mean(keep_nans=True, keep_infs=True), 133.8844959028913) + + assert isclose(f.fetch("chr2R").mean(), 674.4195611285267) + assert isclose(f.fetch("chr2R").mean(keep_infs=True), 674.4195611285267) + assert isclose(f.fetch("chr2R").mean(keep_nans=True), 674.4195611285267) + assert isclose(f.fetch("chr2R").mean(keep_nans=True, keep_infs=True), 674.4195611285267) + + assert isclose(f.fetch("chr2L", "chr2R").mean(), 25.08477098978418) + assert isclose(f.fetch("chr2L", "chr2R").mean(keep_infs=True), 25.08477098978418) + assert isclose(f.fetch("chr2L", "chr2R").mean(keep_nans=True), 25.08477098978418) + assert isclose(f.fetch("chr2L", "chr2R").mean(keep_nans=True, keep_infs=True), 25.08477098978418) + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").mean(), 133.3923248348463) + assert isclose(f.fetch(normalization="VC").mean(keep_nans=True), 133.3923248348463) + assert isinf(f.fetch(normalization="VC").mean(keep_infs=True)) + assert isinf(f.fetch(normalization="VC").mean(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization="VC").mean(), 657.38617107827) + assert isclose(f.fetch("chr2R", normalization="VC").mean(keep_infs=True), 657.38617107827) + assert isclose(f.fetch("chr2R", normalization="VC").mean(keep_nans=True), 657.38617107827) + assert isclose(f.fetch("chr2R", normalization="VC").mean(keep_nans=True, keep_infs=True), 657.38617107827) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").mean(), 29.20444691494886) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").mean(keep_infs=True), 29.20444691494886) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").mean(keep_nans=True), 29.20444691494886) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").mean(keep_nans=True, keep_infs=True), 29.20444691494886 + ) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose(f.fetch(normalization=norm).mean(), 0.002192326655195654) + assert isclose(f.fetch(normalization=norm).mean(keep_infs=True), 0.002192326655195654) + assert isnan(f.fetch(normalization=norm).mean(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).mean(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization=norm).mean(), 0.0105224497942104) + assert isclose(f.fetch("chr2R", normalization=norm).mean(keep_infs=True), 0.0105224497942104) + assert isnan(f.fetch("chr2R", normalization=norm).mean(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).mean(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).mean(), 0.0003800524166065285) + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).mean(keep_infs=True), 0.0003800524166065285) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).mean(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).mean(keep_nans=True, keep_infs=True)) diff --git a/test/test_fetch_min.py b/test/test_fetch_min.py new file mode 100644 index 0000000..7deb74f --- /dev/null +++ b/test/test_fetch_min.py @@ -0,0 +1,90 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +from math import isclose, isnan + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_fetch_min(self, file, resolution): + f = hictkpy.File(file, resolution) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics min --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight + assert f.fetch().min() == 1 + assert f.fetch().min(keep_infs=True) == 1 + assert f.fetch().min(keep_nans=True) == 1 + assert f.fetch().min(keep_nans=True, keep_infs=True) == 1 + + assert f.fetch("chr2R").min() == 1 + assert f.fetch("chr2R").min(keep_infs=True) == 1 + assert f.fetch("chr2R").min(keep_nans=True) == 1 + assert f.fetch("chr2R").min(keep_nans=True, keep_infs=True) == 1 + + assert f.fetch("chr2L", "chr2R").min() == 1 + assert f.fetch("chr2L", "chr2R").min(keep_infs=True) == 1 + assert f.fetch("chr2L", "chr2R").min(keep_nans=True) == 1 + assert f.fetch("chr2L", "chr2R").min(keep_nans=True, keep_infs=True) == 1 + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").min(), 0.006248577402113111) + assert isclose(f.fetch(normalization="VC").min(keep_infs=True), 0.006248577402113111) + assert isclose(f.fetch(normalization="VC").min(keep_nans=True), 0.006248577402113111) + assert isclose(f.fetch(normalization="VC").min(keep_nans=True, keep_infs=True), 0.006248577402113111) + + assert isclose(f.fetch("chr2R", normalization="VC").min(), 3.024889833701339) + assert isclose(f.fetch("chr2R", normalization="VC").min(keep_infs=True), 3.024889833701339) + assert isclose(f.fetch("chr2R", normalization="VC").min(keep_nans=True), 3.024889833701339) + assert isclose(f.fetch("chr2R", normalization="VC").min(keep_nans=True, keep_infs=True), 3.024889833701339) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").min(), 1.179576020123557) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").min(keep_infs=True), 1.179576020123557) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").min(keep_nans=True), 1.179576020123557) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").min(keep_nans=True, keep_infs=True), 1.179576020123557 + ) + + # Even though the test .mcool and .hic files are supposed to be identical, normalized interactions are expected to be very close, but not identical. + # This is due to differences in how balancing weights are stored Cooler and .hic files: + # - Cooler uses float64 multiplicative weights + # - .hic v8 and older use float64 divisive weights + # - .hic v9 uses float32 divisive weights + # Usually the differences are small enough to not being picked up by isclose. + # However, this is not the case for very small or very large values (like those returned by min() and max()) + def isclose_(n1, n2) -> bool: + import math + + return math.isclose(n1, n2, rel_tol=1.0e-6) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose_(f.fetch(normalization=norm).min(), 1.514271829598328e-05) + assert isclose_(f.fetch(normalization=norm).min(keep_infs=True), 1.514271829598328e-05) + assert isnan(f.fetch(normalization=norm).min(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).min(keep_nans=True, keep_infs=True)) + + assert isclose_(f.fetch("chr2R", normalization=norm).min(), 3.324204473054833e-05) + assert isclose_(f.fetch("chr2R", normalization=norm).min(keep_infs=True), 3.324204473054833e-05) + assert isnan(f.fetch("chr2R", normalization=norm).min(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).min(keep_nans=True, keep_infs=True)) + + assert isclose_(f.fetch("chr2L", "chr2R", normalization=norm).min(), 2.117120750101533e-05) + assert isclose_(f.fetch("chr2L", "chr2R", normalization=norm).min(keep_infs=True), 2.117120750101533e-05) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).min(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).min(keep_nans=True, keep_infs=True)) diff --git a/test/test_fetch_nnz.py b/test/test_fetch_nnz.py index 0cf0fea..1db2201 100644 --- a/test/test_fetch_nnz.py +++ b/test/test_fetch_nnz.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Roberto Rossini +# Copyright (C) 2024 Roberto Rossini # # SPDX-License-Identifier: MIT @@ -22,6 +22,53 @@ class TestClass: def test_fetch_nnz(self, file, resolution): f = hictkpy.File(file, resolution) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics nnz --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight assert f.fetch().nnz() == 890_384 + assert f.fetch().nnz(keep_infs=True) == 890_384 + assert f.fetch().nnz(keep_nans=True) == 890_384 + assert f.fetch().nnz(keep_nans=True, keep_infs=True) == 890_384 + assert f.fetch("chr2R").nnz() == 31_900 + assert f.fetch("chr2R").nnz(keep_infs=True) == 31_900 + assert f.fetch("chr2R").nnz(keep_nans=True) == 31_900 + assert f.fetch("chr2R").nnz(keep_nans=True, keep_infs=True) == 31_900 + assert f.fetch("chr2L", "chr2R").nnz() == 59_124 + assert f.fetch("chr2L", "chr2R").nnz(keep_infs=True) == 59_124 + assert f.fetch("chr2L", "chr2R").nnz(keep_nans=True) == 59_124 + assert f.fetch("chr2L", "chr2R").nnz(keep_nans=True, keep_infs=True) == 59_124 + + if "VC" in f.avail_normalizations(): + assert f.fetch(normalization="VC").nnz() == 887_644 + assert f.fetch(normalization="VC").nnz(keep_infs=True) == 890_384 + assert f.fetch(normalization="VC").nnz(keep_nans=True) == 887_644 + assert f.fetch(normalization="VC").nnz(keep_nans=True, keep_infs=True) == 890_384 + + assert f.fetch("chr2R", normalization="VC").nnz() == 31_900 + assert f.fetch("chr2R", normalization="VC").nnz(keep_infs=True) == 31_900 + assert f.fetch("chr2R", normalization="VC").nnz(keep_nans=True) == 31_900 + assert f.fetch("chr2R", normalization="VC").nnz(keep_nans=True, keep_infs=True) == 31_900 + + assert f.fetch("chr2L", "chr2R", normalization="VC").nnz() == 59_124 + assert f.fetch("chr2L", "chr2R", normalization="VC").nnz(keep_infs=True) == 59_124 + assert f.fetch("chr2L", "chr2R", normalization="VC").nnz(keep_nans=True) == 59_124 + assert f.fetch("chr2L", "chr2R", normalization="VC").nnz(keep_nans=True, keep_infs=True) == 59_124 + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert f.fetch(normalization=norm).nnz() == 834_993 + assert f.fetch(normalization=norm).nnz(keep_infs=True) == 834_993 + assert f.fetch(normalization=norm).nnz(keep_nans=True) == 890_384 + assert f.fetch(normalization=norm).nnz(keep_nans=True, keep_infs=True) == 890_384 + + assert f.fetch("chr2R", normalization=norm).nnz() == 28_919 + assert f.fetch("chr2R", normalization=norm).nnz(keep_infs=True) == 28_919 + assert f.fetch("chr2R", normalization=norm).nnz(keep_nans=True) == 31_900 + assert f.fetch("chr2R", normalization=norm).nnz(keep_nans=True, keep_infs=True) == 31_900 + + assert f.fetch("chr2L", "chr2R", normalization=norm).nnz() == 55_432 + assert f.fetch("chr2L", "chr2R", normalization=norm).nnz(keep_infs=True) == 55_432 + assert f.fetch("chr2L", "chr2R", normalization=norm).nnz(keep_nans=True) == 59_124 + assert f.fetch("chr2L", "chr2R", normalization=norm).nnz(keep_nans=True, keep_infs=True) == 59_124 diff --git a/test/test_fetch_skewness.py b/test/test_fetch_skewness.py new file mode 100644 index 0000000..3f8a2e0 --- /dev/null +++ b/test/test_fetch_skewness.py @@ -0,0 +1,85 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +from math import isnan + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_fetch_skewness(self, file, resolution): + f = hictkpy.File(file, resolution) + + def isclose(n1, n2) -> bool: + import math + + return math.isclose(n1, n2, rel_tol=1.0e-4) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics skewness --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight + assert isclose(f.fetch().skewness(), 59.84043704958881) + assert isclose(f.fetch().skewness(keep_infs=True), 59.84043704958881) + assert isclose(f.fetch().skewness(keep_nans=True), 59.84043704958881) + assert isclose(f.fetch().skewness(keep_nans=True, keep_infs=True), 59.84043704958881) + + assert isclose(f.fetch("chr2R").skewness(), 16.74745540670616) + assert isclose(f.fetch("chr2R").skewness(keep_infs=True), 16.74745540670616) + assert isclose(f.fetch("chr2R").skewness(keep_nans=True), 16.74745540670616) + assert isclose(f.fetch("chr2R").skewness(keep_nans=True, keep_infs=True), 16.74745540670616) + + assert isclose(f.fetch("chr2L", "chr2R").skewness(), 39.00616966835411) + assert isclose(f.fetch("chr2L", "chr2R").skewness(keep_infs=True), 39.00616966835411) + assert isclose(f.fetch("chr2L", "chr2R").skewness(keep_nans=True), 39.00616966835411) + assert isclose(f.fetch("chr2L", "chr2R").skewness(keep_nans=True, keep_infs=True), 39.00616966835411) + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").skewness(), 383.0994807473643) + assert isclose(f.fetch(normalization="VC").skewness(keep_nans=True), 383.0994807473643) + assert isnan(f.fetch(normalization="VC").skewness(keep_infs=True)) + assert isnan(f.fetch(normalization="VC").skewness(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization="VC").skewness(), 26.3819375118102) + assert isclose(f.fetch("chr2R", normalization="VC").skewness(keep_infs=True), 26.3819375118102) + assert isclose(f.fetch("chr2R", normalization="VC").skewness(keep_nans=True), 26.3819375118102) + assert isclose( + f.fetch("chr2R", normalization="VC").skewness(keep_nans=True, keep_infs=True), 26.3819375118102 + ) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").skewness(), 91.43201554494111) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").skewness(keep_infs=True), 91.43201554494111) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").skewness(keep_nans=True), 91.43201554494111) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").skewness(keep_nans=True, keep_infs=True), + 91.43201554494111, + ) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose(f.fetch(normalization=norm).skewness(), 52.08028528233407) + assert isclose(f.fetch(normalization=norm).skewness(keep_infs=True), 52.08028528233407) + assert isnan(f.fetch(normalization=norm).skewness(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).skewness(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization=norm).skewness(), 10.84820776645202) + assert isclose(f.fetch("chr2R", normalization=norm).skewness(keep_infs=True), 10.84820776645202) + assert isnan(f.fetch("chr2R", normalization=norm).skewness(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).skewness(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).skewness(), 16.53465790282013) + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).skewness(keep_infs=True), 16.53465790282013) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).skewness(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).skewness(keep_nans=True, keep_infs=True)) diff --git a/test/test_fetch_sum.py b/test/test_fetch_sum.py index 459cb3e..462b67b 100644 --- a/test/test_fetch_sum.py +++ b/test/test_fetch_sum.py @@ -1,8 +1,9 @@ -# Copyright (C) 2023 Roberto Rossini +# Copyright (C) 2024 Roberto Rossini # # SPDX-License-Identifier: MIT import pathlib +from math import isclose, isinf, isnan import pytest @@ -23,6 +24,55 @@ class TestClass: def test_fetch_sum(self, file, resolution): f = hictkpy.File(file, resolution) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics sum --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight assert f.fetch().sum() == 119_208_613 - assert f.fetch("chr2L").sum() == 19_968_156 + assert f.fetch().sum(keep_infs=True) == 119_208_613 + assert f.fetch().sum(keep_nans=True) == 119_208_613 + assert f.fetch().sum(keep_nans=True, keep_infs=True) == 119_208_613 + + assert f.fetch("chr2R").sum() == 21_513_984 + assert f.fetch("chr2R").sum(keep_infs=True) == 21_513_984 + assert f.fetch("chr2R").sum(keep_nans=True) == 21_513_984 + assert f.fetch("chr2R").sum(keep_nans=True, keep_infs=True) == 21_513_984 + assert f.fetch("chr2L", "chr2R").sum() == 1_483_112 + assert f.fetch("chr2L", "chr2R").sum(keep_infs=True) == 1_483_112 + assert f.fetch("chr2L", "chr2R").sum(keep_nans=True) == 1_483_112 + assert f.fetch("chr2L", "chr2R").sum(keep_nans=True, keep_infs=True) == 1_483_112 + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").sum(), 118404896.7857023) + assert isclose(f.fetch(normalization="VC").sum(keep_nans=True), 118404896.7857023) + assert isinf(f.fetch(normalization="VC").sum(keep_infs=True)) + assert isinf(f.fetch(normalization="VC").sum(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization="VC").sum(), 20970618.85739681) + assert isclose(f.fetch("chr2R", normalization="VC").sum(keep_infs=True), 20970618.85739681) + assert isclose(f.fetch("chr2R", normalization="VC").sum(keep_nans=True), 20970618.85739681) + assert isclose(f.fetch("chr2R", normalization="VC").sum(keep_nans=True, keep_infs=True), 20970618.85739681) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").sum(), 1726683.719399437) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").sum(keep_infs=True), 1726683.719399437) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").sum(keep_nans=True), 1726683.719399437) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").sum(keep_nans=True, keep_infs=True), 1726683.719399437 + ) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose(f.fetch(normalization=norm).sum(), 1830.577410801785) + assert isclose(f.fetch(normalization=norm).sum(keep_infs=True), 1830.577410801785) + assert isnan(f.fetch(normalization=norm).sum(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).sum(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization=norm).sum(), 304.2987255987707) + assert isclose(f.fetch("chr2R", normalization=norm).sum(keep_infs=True), 304.2987255987707) + assert isnan(f.fetch("chr2R", normalization=norm).sum(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).sum(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).sum(), 21.06706555733309) + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).sum(keep_infs=True), 21.06706555733309) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).sum(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).sum(keep_nans=True, keep_infs=True)) diff --git a/test/test_fetch_variance.py b/test/test_fetch_variance.py new file mode 100644 index 0000000..81401ef --- /dev/null +++ b/test/test_fetch_variance.py @@ -0,0 +1,84 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib +from math import isnan + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_fetch_variance(self, file, resolution): + f = hictkpy.File(file, resolution) + + def isclose(n1, n2) -> bool: + import math + + return math.isclose(n1, n2, rel_tol=1.0e-4) + + # venv/bin/python test/scripts/compute_stats_for_testing.py test/data/cooler_test_file.mcool 100000 --metrics variance --range "" chr2R chr2L --range2 "" chr2R chr2R --normalization NONE VC weight + assert isclose(f.fetch().variance(), 5373287.890326086) + assert isclose(f.fetch().variance(keep_infs=True), 5373287.890326086) + assert isclose(f.fetch().variance(keep_nans=True), 5373287.890326086) + assert isclose(f.fetch().variance(keep_nans=True, keep_infs=True), 5373287.890326086) + + assert isclose(f.fetch("chr2R").variance(), 30250660.97409304) + assert isclose(f.fetch("chr2R").variance(keep_infs=True), 30250660.97409304) + assert isclose(f.fetch("chr2R").variance(keep_nans=True), 30250660.97409304) + assert isclose(f.fetch("chr2R").variance(keep_nans=True, keep_infs=True), 30250660.97409304) + + assert isclose(f.fetch("chr2L", "chr2R").variance(), 775.8613589939481) + assert isclose(f.fetch("chr2L", "chr2R").variance(keep_infs=True), 775.8613589939481) + assert isclose(f.fetch("chr2L", "chr2R").variance(keep_nans=True), 775.8613589939481) + assert isclose(f.fetch("chr2L", "chr2R").variance(keep_nans=True, keep_infs=True), 775.8613589939481) + + if "VC" in f.avail_normalizations(): + assert isclose(f.fetch(normalization="VC").variance(), 19247119.46018428) + assert isclose(f.fetch(normalization="VC").variance(keep_nans=True), 19247119.46018428) + assert isnan(f.fetch(normalization="VC").variance(keep_infs=True)) + assert isnan(f.fetch(normalization="VC").variance(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization="VC").variance(), 37984857.23777649) + assert isclose(f.fetch("chr2R", normalization="VC").variance(keep_infs=True), 37984857.23777649) + assert isclose(f.fetch("chr2R", normalization="VC").variance(keep_nans=True), 37984857.23777649) + assert isclose( + f.fetch("chr2R", normalization="VC").variance(keep_nans=True, keep_infs=True), 37984857.23777649 + ) + + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").variance(), 16780.3290659542) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").variance(keep_infs=True), 16780.3290659542) + assert isclose(f.fetch("chr2L", "chr2R", normalization="VC").variance(keep_nans=True), 16780.3290659542) + assert isclose( + f.fetch("chr2L", "chr2R", normalization="VC").variance(keep_nans=True, keep_infs=True), 16780.3290659542 + ) + + norm = "weight" if f.is_cooler() else "ICE" + assert norm in f.avail_normalizations() + + assert isclose(f.fetch(normalization=norm).variance(), 0.001012475106404277) + assert isclose(f.fetch(normalization=norm).variance(keep_infs=True), 0.001012475106404277) + assert isnan(f.fetch(normalization=norm).variance(keep_nans=True)) + assert isnan(f.fetch(normalization=norm).variance(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2R", normalization=norm).variance(), 0.005035086513187063) + assert isclose(f.fetch("chr2R", normalization=norm).variance(keep_infs=True), 0.005035086513187063) + assert isnan(f.fetch("chr2R", normalization=norm).variance(keep_nans=True)) + assert isnan(f.fetch("chr2R", normalization=norm).variance(keep_nans=True, keep_infs=True)) + + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).variance(), 1.026472030208059e-07) + assert isclose(f.fetch("chr2L", "chr2R", normalization=norm).variance(keep_infs=True), 1.026472030208059e-07) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).variance(keep_nans=True)) + assert isnan(f.fetch("chr2L", "chr2R", normalization=norm).variance(keep_nans=True, keep_infs=True)) From 13a1f4bf1df5a9be5bdee3a16615a5310fe85ad1 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Nov 2024 21:40:15 +0100 Subject: [PATCH 3/6] Add script used to help generate the test cases for describe() --- test/scripts/compute_stats_for_testing.py | 163 ++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100755 test/scripts/compute_stats_for_testing.py diff --git a/test/scripts/compute_stats_for_testing.py b/test/scripts/compute_stats_for_testing.py new file mode 100755 index 0000000..1850587 --- /dev/null +++ b/test/scripts/compute_stats_for_testing.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import argparse +import pathlib +import warnings +from typing import Any, Dict, List, Sequence + +import numpy as np +import scipy.stats as ss + +import hictkpy + + +def make_cli() -> argparse.ArgumentParser: + cli = argparse.ArgumentParser() + + cli.add_argument( + "uri", + type=pathlib.Path, + help="Path to a .cool, .mcool or hic file.", + ) + cli.add_argument( + "resolution", + type=int, + help="Resolution to be used.", + ) + cli.add_argument( + "--metrics", + type=str, + choices={"nnz", "sum", "min", "max", "mean", "variance", "skewness", "kurtosis"}, + help="One or more stats to be computed.", + ) + cli.add_argument("--range", nargs="+", type=str, help="Coordinates of the genomic regions to be processed") + cli.add_argument("--range2", nargs="+", type=str, help="Coordinates of the genomic regions to be processed") + cli.add_argument( + "--normalization", nargs="+", type=str, help="Balance interactions using the given normalization method." + ) + + return cli + + +def format_command( + range1: str | None, + range2: str | None, + normalization: str | None, + keep_nans: bool, + keep_infs: bool, + metric: str, + value: int | float, +) -> str: + if np.isnan(value): + prefix = "assert isnan(f.fetch(" + suffix = ")" + elif np.isinf(value): + prefix = "assert isinf(f.fetch(" + suffix = ")" + elif isinstance(value, float): + prefix = "assert isclose(f.fetch(" + suffix = f", {value:.16g})" + else: + prefix = "assert f.fetch(" + suffix = f" == {value:_}" + + args = [] + if range1: + args.append(f'"{range1}"') + if range2 and range1 != range2: + args.append(f'"{range2}"') + if normalization is not None and normalization != "NONE": + args.append(f'normalization="{normalization}"') + + msg = prefix + ", ".join(args) + msg += f").{metric}(" + args = [] + if keep_nans: + args.append("keep_nans=True") + if keep_infs: + args.append("keep_infs=True") + msg += f"{", ".join(args)})" + msg += suffix + + return msg + + +def format_stats( + range1: str, + range2: str, + normalization: str, + stats: Dict[str, Any], + keep_nans: bool = False, + keep_infs: bool = False, +) -> List[str]: + messages = [] + for metric, value in stats.items(): + messages.append(format_command(range1, range2, normalization, keep_nans, keep_infs, metric, value)) + return messages + + +def compute_stats(counts: Sequence[int] | Sequence[float], metrics: List[str]) -> Dict[str, Any]: + with warnings.catch_warnings(action="ignore"): + stats = ss.describe(counts, nan_policy="propagate") + stats = { + "nnz": stats.nobs, + "sum": sum(counts), + "min": stats.minmax[0], + "max": stats.minmax[1], + "mean": stats.mean, + "variance": stats.variance, + "skewness": stats.skewness, + "kurtosis": stats.kurtosis, + } + + return {metric: value for metric, value in stats.items() if metric in metrics} + + +def process_param_combination(f: hictkpy.File, range: str, range2: str, normalization: str, metrics: List[str]): + counts = f.fetch(range, range2, normalization).to_df()["count"] + + raw = compute_stats(counts, metrics) + keep_nans = compute_stats(counts[~np.isinf(counts)], metrics) + keep_infs = compute_stats(counts[~np.isnan(counts)], metrics) + finite = compute_stats(counts[np.isfinite(counts)], metrics) + + messages = format_stats(range, range2, normalization, finite) + messages.extend(format_stats(range, range2, normalization, keep_nans, keep_nans=True)) + messages.extend(format_stats(range, range2, normalization, keep_infs, keep_infs=True)) + messages.extend(format_stats(range, range2, normalization, raw, keep_nans=True, keep_infs=True)) + + print("\n".join(sorted(messages))) + + +def main(): + args = vars(make_cli().parse_args()) + + if args["metrics"] is None: + args["metrics"] = ["nnz", "sum", "min", "max", "mean", "variance", "skewness", "kurtosis"] + + if args["range"] is None: + args["range"] = [""] + + if args["range2"] is None: + args["range2"] = [""] + + if len(args["range"]) != len(args["range2"]): + raise RuntimeError("please provide the same number of arguments for --range and --range2") + + if args["normalization"] is None: + args["normalization"] = ["NONE"] + + f = hictkpy.File(args["uri"], args["resolution"]) + + for norm in args["normalization"]: + for range1, range2 in zip(args["range"], args["range2"]): + process_param_combination(f, range1, range2, norm, args["metrics"]) + print() + + +if __name__ == "__main__": + main() From fe8aff3ae17d5127c9f18f68381c4ce8eb63d8b8 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Nov 2024 22:00:20 +0100 Subject: [PATCH 4/6] Update docs --- docs/api/generic.rst | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/docs/api/generic.rst b/docs/api/generic.rst index f39bee4..007d15f 100644 --- a/docs/api/generic.rst +++ b/docs/api/generic.rst @@ -45,8 +45,6 @@ Generic API .. automethod:: coord1 .. automethod:: coord2 - .. automethod:: nnz - .. automethod:: sum .. automethod:: to_arrow .. automethod:: to_coo .. automethod:: to_csr @@ -54,6 +52,40 @@ Generic API .. automethod:: to_numpy .. automethod:: to_pandas + **Statistics** + + :py:class:`hictkpy.PixelSelector` exposes several methods to compute or estimate several statistics efficiently. + + The main features of these methods are: + + * All statistics are computed by traversing the data only once and without caching interactions. + * All methods can be tweaked to include or exclude non-finite values. + * All functions implemented using short-circuiting to detect scenarios where the required statistics can be computed without traversing all pixels. + + The following statistics are guaranteed to be exact: + + * nnz + * sum + * min + * max + * mean + + The rest of the supported statistics (currently variance, skewness, and kurtosis) are estimated and are thus not guaranteed to be exact. + However, in practice, the estimation is usually very accurate (based on my tests, the rel. error is always < 1.0e-4, and typically < 1.0e-6). + The only case where the estimates are likely to be inaccurate is when the sample size (i.e. the number of non-zero pixels overlapping a query) is very small. + + .. automethod:: describe + .. automethod:: kurtosis + .. automethod:: max + .. automethod:: mean + .. automethod:: min + .. automethod:: nnz + .. automethod:: skewness + .. automethod:: sum + .. automethod:: variance + + **Iteration** + .. automethod:: __iter__ .. code-block:: ipythonconsole From 6e003dfb6681ffbc567463e84006657d9308e576 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Nov 2024 22:30:34 +0100 Subject: [PATCH 5/6] Refactor --- src/include/hictkpy/pixel_aggregator.hpp | 17 +++ src/pixel_aggregator_impl.hpp | 159 ++++++++++++++--------- 2 files changed, 115 insertions(+), 61 deletions(-) diff --git a/src/include/hictkpy/pixel_aggregator.hpp b/src/include/hictkpy/pixel_aggregator.hpp index 1054e92..d536b8f 100644 --- a/src/include/hictkpy/pixel_aggregator.hpp +++ b/src/include/hictkpy/pixel_aggregator.hpp @@ -107,6 +107,23 @@ class PixelAggregator { template [[nodiscard]] Stats extract(const phmap::flat_hash_set& metrics); + + template + [[nodiscard]] static std::int64_t extract_nnz(const Accumulator& accumulator); + template + [[nodiscard]] N extract_sum(const Accumulator& accumulator) const; + template + [[nodiscard]] N extract_min(const Accumulator& accumulator) const; + template + [[nodiscard]] N extract_max(const Accumulator& accumulator) const; + template + [[nodiscard]] double extract_mean(const Accumulator& accumulator) const; + template + [[nodiscard]] double extract_variance(const Accumulator& accumulator) const; + template + [[nodiscard]] double extract_skewness(const Accumulator& accumulator) const; + template + [[nodiscard]] double extract_kurtosis(const Accumulator& accumulator) const; }; } // namespace hictkpy diff --git a/src/pixel_aggregator_impl.hpp b/src/pixel_aggregator_impl.hpp index 793f5ce..8b37e1a 100644 --- a/src/pixel_aggregator_impl.hpp +++ b/src/pixel_aggregator_impl.hpp @@ -313,88 +313,37 @@ inline Stats PixelAggregator::extract(const phmap::flat_hash_set& m Stats stats{}; std::visit( - [&](auto& accumulator) { + [&](const auto& accumulator) { if (metrics.contains("nnz")) { - stats.nnz = boost::accumulators::count(accumulator); + stats.nnz = extract_nnz(accumulator); } if (metrics.contains("sum")) { - if constexpr (std::is_floating_point_v) { - if (_nan_found || (_neg_inf_found && _pos_inf_found)) { - stats.sum = std::numeric_limits::quiet_NaN(); - } else if (_pos_inf_found) { - stats.sum = std::numeric_limits::infinity(); - } else if (_neg_inf_found) { - stats.sum = -std::numeric_limits::infinity(); - } - } - if (!stats.sum.has_value()) { - stats.sum = boost::accumulators::sum(accumulator); - } + stats.sum = extract_sum(accumulator); } if (metrics.contains("min")) { - if constexpr (std::is_floating_point_v) { - if (_nan_found) { - stats.min = std::numeric_limits::quiet_NaN(); - } else if (_neg_inf_found) { - stats.min = -std::numeric_limits::infinity(); - } - } - if (!stats.min.has_value()) { - stats.min = boost::accumulators::min(accumulator); - } + stats.min = extract_min(accumulator); } if (metrics.contains("max")) { - if constexpr (std::is_floating_point_v) { - if (_nan_found) { - stats.max = std::numeric_limits::quiet_NaN(); - } else if (_pos_inf_found) { - stats.max = std::numeric_limits::infinity(); - } - } - if (!stats.max.has_value()) { - stats.max = boost::accumulators::max(accumulator); - } + stats.max = extract_max(accumulator); } if (metrics.contains("mean")) { - if (_nan_found || (_neg_inf_found && _pos_inf_found)) { - stats.mean = std::numeric_limits::quiet_NaN(); - } else if (_pos_inf_found) { - stats.mean = std::numeric_limits::infinity(); - } else if (_neg_inf_found) { - stats.mean = -std::numeric_limits::infinity(); - } else { - stats.mean = boost::accumulators::mean(accumulator); - } + stats.mean = extract_mean(accumulator); } if (metrics.contains("variance")) { - if (_nan_found || _pos_inf_found || _neg_inf_found) { - stats.variance = std::numeric_limits::quiet_NaN(); - } else { - stats.variance = boost::accumulators::variance(accumulator); - } + stats.variance = extract_variance(accumulator); } if (metrics.contains("skewness")) { - if (_nan_found || _pos_inf_found || _neg_inf_found) { - stats.skewness = std::numeric_limits::quiet_NaN(); - } else { - stats.skewness = boost::accumulators::skewness(accumulator); - } + stats.skewness = extract_skewness(accumulator); } if (metrics.contains("kurtosis")) { - if (_nan_found || _pos_inf_found || _neg_inf_found) { - stats.kurtosis = std::numeric_limits::quiet_NaN(); - } else if (_kurtosis_accumulator.has_value()) { - stats.kurtosis = boost::accumulators::kurtosis(*_kurtosis_accumulator); - } else { - stats.kurtosis = boost::accumulators::kurtosis(accumulator); - } + stats.kurtosis = extract_kurtosis(accumulator); } }, _accumulator); @@ -402,4 +351,92 @@ inline Stats PixelAggregator::extract(const phmap::flat_hash_set& m return stats; } -}; // namespace hictkpy +template +inline std::int64_t PixelAggregator::extract_nnz(const Accumulator& accumulator) { + return static_cast(boost::accumulators::count(accumulator)); +} + +template +inline N PixelAggregator::extract_sum(const Accumulator& accumulator) const { + if constexpr (std::is_floating_point_v) { + if (_nan_found || (_neg_inf_found && _pos_inf_found)) { + return std::numeric_limits::quiet_NaN(); + } + if (_pos_inf_found) { + return std::numeric_limits::infinity(); + } + if (_neg_inf_found) { + return -std::numeric_limits::infinity(); + } + } + return boost::accumulators::sum(accumulator); +} + +template +inline N PixelAggregator::extract_min(const Accumulator& accumulator) const { + if constexpr (std::is_floating_point_v) { + if (_nan_found) { + return std::numeric_limits::quiet_NaN(); + } + if (_neg_inf_found) { + return -std::numeric_limits::infinity(); + } + } + return boost::accumulators::min(accumulator); +} + +template +inline N PixelAggregator::extract_max(const Accumulator& accumulator) const { + if constexpr (std::is_floating_point_v) { + if (_nan_found) { + return std::numeric_limits::quiet_NaN(); + } + if (_pos_inf_found) { + return std::numeric_limits::infinity(); + } + } + return boost::accumulators::max(accumulator); +} + +template +inline double PixelAggregator::extract_mean(const Accumulator& accumulator) const { + if (_nan_found || (_neg_inf_found && _pos_inf_found)) { + return std::numeric_limits::quiet_NaN(); + } + if (_pos_inf_found) { + return std::numeric_limits::infinity(); + } + if (_neg_inf_found) { + return -std::numeric_limits::infinity(); + } + return boost::accumulators::mean(accumulator); +} + +template +inline double PixelAggregator::extract_variance(const Accumulator& accumulator) const { + if (_nan_found || _pos_inf_found || _neg_inf_found) { + return std::numeric_limits::quiet_NaN(); + } + return boost::accumulators::variance(accumulator); +} + +template +inline double PixelAggregator::extract_skewness(const Accumulator& accumulator) const { + if (_nan_found || _pos_inf_found || _neg_inf_found) { + return std::numeric_limits::quiet_NaN(); + } + return boost::accumulators::skewness(accumulator); +} + +template +inline double PixelAggregator::extract_kurtosis(const Accumulator& accumulator) const { + if (_nan_found || _pos_inf_found || _neg_inf_found) { + return std::numeric_limits::quiet_NaN(); + } + if (_kurtosis_accumulator.has_value()) { + return boost::accumulators::kurtosis(*_kurtosis_accumulator); + } + return boost::accumulators::kurtosis(accumulator); +} + +} // namespace hictkpy From 977df751f03b4cd7c2ddc7a6f229fd26d5e8898a Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Nov 2024 22:33:41 +0100 Subject: [PATCH 6/6] Fix MSVC builds --- src/pixel_aggregator_impl.hpp | 49 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/src/pixel_aggregator_impl.hpp b/src/pixel_aggregator_impl.hpp index 8b37e1a..8fcffdf 100644 --- a/src/pixel_aggregator_impl.hpp +++ b/src/pixel_aggregator_impl.hpp @@ -93,23 +93,21 @@ inline void PixelAggregator::validate_metrics(const phmap::flat_hash_set inline void PixelAggregator::process_non_finite(N n) noexcept { static_assert(std::is_arithmetic_v); - if constexpr (std::is_integral_v) { - return; - } - - if constexpr (keep_nans) { - if (HICTKPY_UNLIKELY(std::isnan(n))) { - _nan_found = true; - return; + if constexpr (std::is_floating_point_v) { + if constexpr (keep_nans) { + if (HICTKPY_UNLIKELY(std::isnan(n))) { + _nan_found = true; + return; + } } - } - if constexpr (keep_infs) { - if (HICTKPY_UNLIKELY(std::isinf(n))) { - if (n > 0) { - _pos_inf_found = true; - } else { - _neg_inf_found = true; + if constexpr (keep_infs) { + if (HICTKPY_UNLIKELY(std::isinf(n))) { + if (n > 0) { + _pos_inf_found = true; + } else { + _neg_inf_found = true; + } } } } @@ -214,16 +212,17 @@ inline bool PixelAggregator::drop_value([[maybe_unused]] N n) noexcept { static_assert(std::is_arithmetic_v); if constexpr (!std::is_floating_point_v) { return false; - } - - if constexpr (!keep_nans && !keep_infs) { - return !std::isfinite(n); - } - if constexpr (!keep_nans) { - return std::isnan(n); - } - if constexpr (!keep_infs) { - return std::isinf(n); + } else { + // MSVC gets confused if this chunk of code is not in an else branch... + if constexpr (!keep_nans && !keep_infs) { + return !std::isfinite(n); + } + if constexpr (!keep_nans) { + return std::isnan(n); + } + if constexpr (!keep_infs) { + return std::isinf(n); + } } return false;