diff --git a/LICENSE b/LICENSE index 2071b23..913f2fa 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) +Copyright (c) Zed A. Shaw Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff --git a/cpp/Makefile b/cpp/Makefile index cd6dd60..addb12a 100644 --- a/cpp/Makefile +++ b/cpp/Makefile @@ -1,3 +1,4 @@ +R_SCRIPT=C:/Users/lcthw/AppData/Local/Programs/R/R-4.6.0/bin/Rscript.exe all: build reset: @@ -19,7 +20,10 @@ debug_build: meson compile -j 10 -C builddir test: build - ./builddir/fuc2it + ./builddir/fuc2it -v + +r_test: + $(R_SCRIPT) tests/confirm.r run: build test ifeq '$(OS)' 'Windows_NT' diff --git a/cpp/src/stats.cpp b/cpp/src/stats.cpp index 3bd0039..c54fffd 100644 --- a/cpp/src/stats.cpp +++ b/cpp/src/stats.cpp @@ -1,7 +1,9 @@ #include "stats.hpp" +#include "welford.hpp" #include #include #include +#include void Stats::dump(std::string msg) { @@ -11,16 +13,26 @@ void Stats::dump(std::string msg) stddev()); } -// WARNING: got this code from google AI. PROBABLY BULLSHIT -void Stats::t_test(Stats& other) { - double n1 = this->n; - double n2 = other.n; +double simpson(double a, double b, uint32_t n, const std::function func) { + const double h = ( b - a ) / n; + double sum = 0.0; + for ( uint32_t i = 0; i < n; ++i ) { + const double x = a + i * h; + sum += ( func(x) + 4.0 * func(x + h / 2.0) + func(x + h) ) / 6.0; + } + return sum * h; +} + + +TTest internal_t_test(auto& first, auto& second) { + double n1 = first.n; + double n2 = second.n; - double mean1 = this->mean(); - double mean2 = other.mean(); + double mean1 = first.mean(); + double mean2 = second.mean(); - double var1 = this->variance(); - double var2 = other.variance(); + double var1 = first.sample_variance(); + double var2 = second.sample_variance(); // calculate t-stat double delta_mean = mean1 - mean2; @@ -30,12 +42,33 @@ void Stats::t_test(Stats& other) { // calculate degrees of freedom double num = std::pow((var1 / n1) + (var2 / n2), 2); double den = (std::pow(var1 / n1, 2) / (n1 - 1)) + (std::pow(var2 / n2, 2) / (n2 - 1)); - double df = num / den; + double dof = num / den; + + // welch calculation + double temp = first.sample_variance() / first.n + second.sample_variance() / second.n; + double welchs = (first.mean() - second.mean()) / std::sqrt(temp); + + double gamm = std::exp( + std::lgamma(dof / 2.0) + std::lgamma(0.5) + - std::lgamma(dof / 2.0 + 0.5)); + double b = dof / ( welchs * welchs + dof); - double x = df / (df + t_stat * t_stat); + auto func = [&](const double r) { + return std::pow(r, dof / 2.0 - 1.0) / std::sqrt(1.0 - r); + }; - double p_val = std::tgamma((df + 1.0) / 2.0) / (std::sqrt(df * std::numbers::pi) * std::tgamma(df / 2.0)) * std::pow(1.0 + (t_stat * t_stat) / df, -(df + 1.0) / 2.0); + double p_val = simpson(0.0, b, 10000, func) / gamm; + + return { + .t_stat = t_stat, + .dof = dof, + .p_val = p_val}; +} + +TTest Stats::t_test(Stats& other) { + return internal_t_test(*this, other); +} - fmt::println("t_stat={}, df={}, p_val={}", - t_stat, df, p_val); +TTest WelfordStats::t_test(WelfordStats& other) { + return internal_t_test(*this, other); } diff --git a/cpp/src/stats.hpp b/cpp/src/stats.hpp index 5d819a4..a0687e1 100644 --- a/cpp/src/stats.hpp +++ b/cpp/src/stats.hpp @@ -2,6 +2,12 @@ #include #include +struct TTest { + double t_stat = 0.0; + double dof = 0.0; + double p_val = 0.0; +}; + struct Stats { using TimeBullshit = std::chrono::time_point; @@ -28,6 +34,10 @@ struct Stats { } inline double variance() { + return (sumsq - (sum * sum / n)) / n; + } + + inline double sample_variance() { return (sumsq - (sum * sum / n)) / (n - 1); } @@ -61,5 +71,5 @@ struct Stats { void dump(std::string msg=""); - void t_test(Stats& other); + TTest t_test(Stats& other); }; diff --git a/cpp/tests/stats_tests.cpp b/cpp/tests/stats_tests.cpp index 0787632..2109dfa 100644 --- a/cpp/tests/stats_tests.cpp +++ b/cpp/tests/stats_tests.cpp @@ -2,19 +2,181 @@ #include #include #include +#include "stats.hpp" +#include "welford.hpp" +#include +#include using namespace fuc2; +std::random_device RNG; +std::mt19937 GENERATOR(RNG()); + +template +T uniform(T from, T to) { + std::uniform_int_distribution rand(from, to); + + return rand(GENERATOR); +} + +template +T uniform_real(T from, T to) { + std::uniform_real_distribution rand(from, to); + + return rand(GENERATOR); +} + namespace stats_tests { - void test_stats_stuff() { - CHECK(1 == 1); + std::vector generate_samples(size_t count) { + std::vector samples; + + // generate random numbers + for(size_t i = 0; i < count; i++) { + double num = uniform_real(0.0, 100000.0); + samples.push_back(num); + } + + return samples; + } + + void test_basic_stats() { + Stats stats; + auto samples = generate_samples(1000); + + // calculate it manually from the vector of numbers + double sum = 0.0; + double sumsq = 0.0; + for(auto num : samples) { + sum += num; + sumsq += num * num; + + stats.sample(num); + } + + double mean = sum / samples.size(); + + ALMOST_EQUAL(sum, stats.sum, 5); + ALMOST_EQUAL(sumsq, stats.sumsq, 5); + + // calculate mean and stddev using Stats + ALMOST_EQUAL(mean, stats.mean(), 5); + + double n = samples.size(); + // currently using variance, not sample_variance for stddev + double var = (sumsq - (sum * sum / n)) / n; + ALMOST_EQUAL(var, stats.variance(), 5); + + double stddev = std::sqrt(var); + ALMOST_EQUAL(stddev, stats.stddev(), 5); + } + + void test_welford() { + Stats stats; + WelfordStats welf; + + for(size_t i = 0; i < 10000; i++) { + double num = uniform_real(0.0, 1.0); + stats.sample(num); + welf.sample(num); + } + + ALMOST_EQUAL(stats.mean(), welf.mean(), 5); + ALMOST_EQUAL(stats.variance(), welf.variance(), 5); + ALMOST_EQUAL(stats.stddev(), welf.stddev(), 5); + } + + void test_timing_stats() { + Stats stats; + // need to find something to time... + } + + double round_to(double value, int decimal_places) { + double multiplier = std::pow(10.0, decimal_places); + return std::round(value * multiplier) / multiplier; + } + + void test_t_test() { + Stats stats; + Stats stats2; + WelfordStats welf; + WelfordStats welf2; + + std::string filename("samples.txt"); + std::fstream s{filename, s.binary | s.trunc | s.out}; + double skew_factor = 0.8; + + for(size_t i = 0; i < 10; i++) { + double num = uniform_real(-10.0, 10.0); + auto line = fmt::format("{} {}\n", num, num + skew_factor); + s << line; + + stats.sample(num); + stats2.sample(num + skew_factor); + + welf.sample(num); + welf2.sample(num + skew_factor); + } + + fmt::println("welford samples t-test:"); + welf.t_test(welf2); + + fmt::println("naive algebra samples t-test:"); + stats.t_test(stats2); + } + + void failing_bad_t_test() { + std::vector sample1{ + -1.94269, 3.95854, + 0.215857, -9.16792, + 0.939992, 1.38189, + 7.66882, -6.18898, + -1.12191, -9.95097, + }; + + std::vector sample2{ + -0.942695, 4.95854, + 1.21586, -8.16792, + 1.93999, 2.38189, + 8.66882, -5.18898, + -0.121914, -8.95097, + }; + + double expect_t =-0.39838; + double expect_dof = 18; + double expect_p_val =0.695; + + Stats stats1; + Stats stats2; + WelfordStats welf1; + WelfordStats welf2; + + for(double num : sample1) { + stats1.sample(num); + welf1.sample(num); + } + + for(double num : sample2) { + stats2.sample(num); + welf2.sample(num); + } + + auto stats_test = stats1.t_test(stats2); + auto welf_test = welf1.t_test(welf2); + + ALMOST_EQUAL(stats_test.t_stat, welf_test.t_stat, 5); + ALMOST_EQUAL(stats_test.dof, welf_test.dof, 1); + ALMOST_EQUAL(stats_test.p_val, welf_test.p_val, 5); } fuc2::Set TESTS{ .name="stats", .tests={ - TEST(test_stats_stuff), + TEST(failing_bad_t_test), + TEST(test_basic_stats), + TEST(test_timing_stats), + TEST(test_t_test), + TEST(test_welford), } }; } diff --git a/cpp/wraps/fuc2.wrap b/cpp/wraps/fuc2.wrap index 50fe0df..dc4fa27 100644 --- a/cpp/wraps/fuc2.wrap +++ b/cpp/wraps/fuc2.wrap @@ -1,5 +1,5 @@ [wrap-git] -directory=fuc2-0.1.0 +directory=fuc2-0.2.0 url=https://lcthw.dev/cpp/fuc2.git revision=HEAD depth=1