From fc55b22aff8ef165350236c967ae80fa7e741dae Mon Sep 17 00:00:00 2001 From: fbourgey Date: Fri, 13 Mar 2026 18:07:42 -0400 Subject: [PATCH 1/3] ENH: Add exact p-value computation for Cramer-von Mises two-sample test --- include/xsf/stats.h | 82 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/include/xsf/stats.h b/include/xsf/stats.h index 61bf7cfcd6..1e415c410a 100644 --- a/include/xsf/stats.h +++ b/include/xsf/stats.h @@ -16,6 +16,7 @@ #include "xsf/cephes/tukey.h" #include "xsf/erf.h" #include "xsf/gamma.h" +#include namespace xsf { @@ -284,6 +285,87 @@ inline double pdtrc(double k, double m) { return cephes::pdtrc(k, m); } inline double pdtri(int k, double y) { return cephes::pdtri(k, y); } +inline int64_t comb(int64_t n, int64_t k) { + // binomial coefficient n choose k, i.e. n! / (k! * (n - k)!) + if (k > n - k) + k = n - k; + int64_t result = 1; + for (int64_t i = 0; i < k; ++i) { + result = result * (n - i) / (i + 1); + } + return result; +} + +/* + * Compute the exact p-value of the Cramer-von Mises two-sample test + * for a given value s of the test statistic. + * m and n are the sizes of the samples. + * + * [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for + * the Cramér-Von Mises Two-Sample Test", J. Stat. Soft., + * vol. 17, no. 8, pp. 1-15, Dec. 2006. + * [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises + * Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist. + * 33(3), 1148-1159, (September, 1962) + */ +inline double pval_cvm_2samp_exact(double s, int64_t m, int64_t n) { + // [1, p. 3] + int64_t lcm = std::lcm(m, n); + // [1, p. 4], below eq. 3 + int64_t a = lcm / m; + int64_t b = lcm / n; + // Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$ + // Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2] + int64_t mn = m * n; + // Uses double floor division since s is double + int64_t zeta = std::floor((lcm * lcm * (m + n) * (6.0 * s - mn * (4.0 * mn - 1))) / (6.0 * mn * mn)); + int64_t combinations = comb(m + n, m); + // Each frequency table maps value -> frequency, + // mirroring the 2-row numpy array where row 0 = values, row 1 = frequencies + using FreqTable = std::map; + // the frequency table of g_{u, v}^+ defined in [1, p. 6] + // gs[0] = {0: 1}, gs[1..m] = empty + std::vector gs(m + 1); + gs[0][0] = 1; + for (int64_t u = 0; u < n + 1; ++u) { + std::vector next_gs; + FreqTable tmp; + for (int64_t v = 0; v < m + 1; ++v) { + // Calculate g recursively with eq. 11 in [1]. Even though it + // doesn't look like it, this also does 12/13 (all of Algorithm 1). + const FreqTable &g = gs[v]; + // Merge tmp and g: for common keys sum frequencies, + // keep unique keys from both sides. + // (equivalent to np.intersect1d + concatenate logic) + FreqTable merged; + for (const auto &[key, freq] : tmp) { + merged[key] += freq; + } + for (const auto &[key, freq] : g) { + merged[key] += freq; + } + int64_t diff = a * v - b * u; + int64_t res = diff * diff; + // tmp[0] += res (shift all keys by res) + tmp.clear(); + for (const auto &[key, freq] : merged) { + tmp[key + res] += freq; + } + next_gs.push_back(tmp); + } + gs = std::move(next_gs); + } + // (equivalent to return np.float64(np.sum(freq[value >= zeta]) / combinations)) + const FreqTable &final_table = gs[m]; + int64_t sum_freq = 0; + for (const auto &[value, freq] : final_table) { + if (value >= zeta) { + sum_freq += freq; + } + } + return static_cast(sum_freq) / static_cast(combinations); +} + inline double smirnov(int n, double x) { return cephes::smirnov(n, x); } inline double smirnovc(int n, double x) { return cephes::smirnovc(n, x); } From 507577a5d8c4bbc50dbb87785d636459cf2113c8 Mon Sep 17 00:00:00 2001 From: fbourgey Date: Fri, 13 Mar 2026 18:08:02 -0400 Subject: [PATCH 2/3] TST: Add tests --- tests/xsf_tests/test_pval_cvm_2samp_exact.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/xsf_tests/test_pval_cvm_2samp_exact.cpp diff --git a/tests/xsf_tests/test_pval_cvm_2samp_exact.cpp b/tests/xsf_tests/test_pval_cvm_2samp_exact.cpp new file mode 100644 index 0000000000..c7d1acea67 --- /dev/null +++ b/tests/xsf_tests/test_pval_cvm_2samp_exact.cpp @@ -0,0 +1,38 @@ +#include "../testing_utils.h" +#include + +/* +// Reference values computed with scipy.stats._hypotests._pval_cvm_2samp_exact + +import numpy as np +from scipy import stats + +rng = np.random.default_rng(seed=42) + +list_m = rng.integers(3, 30, size=5) +list_n = rng.integers(3, 30, size=5) +rtol = 1e-10 + +for m, n in zip(list_m, list_n): + x = rng.standard_normal(m) + y = rng.standard_normal(n) + res = stats.cramervonmises_2samp(x, y, method="exact") + T = res.statistic + # Convert normalized statistic T to the unnormalized U + U = m * n * (m + n) * T + m * n * (4 * m * n - 1) / 6 + p_value = stats._hypotests._pval_cvm_2samp_exact(U, m, n) + assert np.isclose(res.pvalue, p_value, rtol=rtol), "The p-values do not match!" + print(f"U={U}, m={m}, n={n}, p-value={p_value}") +*/ +TEST_CASE("pval_cvm_2samp_exact test", "[pval_cvm_2samp_exact][xsf_tests]") { + using test_case = std::tuple; + auto [s, m, n, pval_expected, rtol] = GENERATE( + test_case{12559.0, 5, 26, 0.11812654860485784, 1e-10}, test_case{8901.0, 23, 5, 0.9907610907610908, 1e-10}, + test_case{119376.0, 20, 21, 0.5716351061359124, 1e-10}, test_case{8862.0, 14, 8, 0.2679738562091503, 1e-10}, + test_case{3491.0000000000005, 14, 5, 0.34657722738218094, 1e-10} + ); + const auto pval = xsf::pval_cvm_2samp_exact(s, m, n); + const auto rel_error = xsf::extended_relative_error(pval, pval_expected); + CAPTURE(s, m, n, pval, pval_expected, rel_error); + REQUIRE(rel_error <= rtol); +} From 95b0163b1a0fabdcb28e7459b334d9428ee6f13a Mon Sep 17 00:00:00 2001 From: fbourgey Date: Sat, 14 Mar 2026 15:45:13 -0400 Subject: [PATCH 3/3] MAINT: use xsf/binom --- include/xsf/stats.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/include/xsf/stats.h b/include/xsf/stats.h index 1e415c410a..a875c044ba 100644 --- a/include/xsf/stats.h +++ b/include/xsf/stats.h @@ -1,6 +1,7 @@ #pragma once #include "xsf/bessel.h" +#include "xsf/binom.h" #include "xsf/cephes/bdtr.h" #include "xsf/cephes/chdtr.h" #include "xsf/cephes/fdtr.h" @@ -285,17 +286,6 @@ inline double pdtrc(double k, double m) { return cephes::pdtrc(k, m); } inline double pdtri(int k, double y) { return cephes::pdtri(k, y); } -inline int64_t comb(int64_t n, int64_t k) { - // binomial coefficient n choose k, i.e. n! / (k! * (n - k)!) - if (k > n - k) - k = n - k; - int64_t result = 1; - for (int64_t i = 0; i < k; ++i) { - result = result * (n - i) / (i + 1); - } - return result; -} - /* * Compute the exact p-value of the Cramer-von Mises two-sample test * for a given value s of the test statistic. @@ -319,7 +309,7 @@ inline double pval_cvm_2samp_exact(double s, int64_t m, int64_t n) { int64_t mn = m * n; // Uses double floor division since s is double int64_t zeta = std::floor((lcm * lcm * (m + n) * (6.0 * s - mn * (4.0 * mn - 1))) / (6.0 * mn * mn)); - int64_t combinations = comb(m + n, m); + int64_t combinations = static_cast(xsf::binom(static_cast(m + n), static_cast(m))); // Each frequency table maps value -> frequency, // mirroring the 2-row numpy array where row 0 = values, row 1 = frequencies using FreqTable = std::map;