Added binomial_random_vars_are_different() and event_correlation().

c7fcf2ab · Davis King · 126b4b76 · c7fcf2ab · c7fcf2ab · c7fcf2ab
Commit c7fcf2ab authored Jan 24, 2018 by Davis King
Hide whitespace changes
Inline Side-by-side

Showing with 149 additions and 0 deletions

statistics.h dlib/statistics/statistics.h +65 -0

statistics_abstract.h dlib/statistics/statistics_abstract.h +67 -0

statistics.cpp dlib/test/statistics.cpp +17 -0

No files found.
--- a/dlib/statistics/statistics.h
+++ b/dlib/statistics/statistics.h
@@ -1818,6 +1818,71 @@ namespace dlib
        serialize(item.pca, out);
    }
+// ----------------------------------------------------------------------------------------
+    inline double binomial_random_vars_are_different (
+        uint64_t k1,
+        uint64_t n1,
+        uint64_t k2,
+        uint64_t n2
+    )
+    {
+        // Computes the log likelihood ratio between "X1 and X2 each have their own
+        // success rate" and "X1 and X2 share one pooled success rate".  The result is 0
+        // when the observed rates agree and grows as they diverge.
+        DLIB_ASSERT(k1 <= n1, "k1: "<< k1 << "  n1: "<< n1);
+        DLIB_ASSERT(k2 <= n2, "k2: "<< k2 << "  n2: "<< n2);
+        // Maximum likelihood estimates of the success rates.  A variable with zero
+        // trials has no estimate: k/n would be 0/0 == NaN and the NaN would propagate
+        // into the return value.  Use 0 as a placeholder instead; it is never used
+        // because log_lik() reports 0 for an empty sample.
+        const double p1 = (n1 != 0) ? k1/static_cast<double>(n1) : 0.0;
+        const double p2 = (n2 != 0) ? k2/static_cast<double>(n2) : 0.0;
+        const double p  = (n1+n2 != 0) ? (k1+k2)/static_cast<double>(n1+n2) : 0.0;
+        // Binomial log likelihood (dropping the binomial coefficient, which cancels in
+        // the ratio) of seeing k successes in n trials when the success rate is prob.
+        auto log_lik = [](double prob, uint64_t k, uint64_t n) {
+            // An empty sample carries no information.  prob of exactly 0 or 1 is
+            // special cased so that we never evaluate 0*log(0).
+            if (n == 0 || prob == 0 || prob == 1)
+                return 0.0;
+            return k*std::log(prob) + (n-k)*std::log(1-prob);
+        };
+        return log_lik(p1,k1,n1) + log_lik(p2,k2,n2) - log_lik(p,k1,n1) - log_lik(p,k2,n2);
+    }
+// ----------------------------------------------------------------------------------------
+    inline double event_correlation (
+        uint64_t A_count,
+        uint64_t B_count,
+        uint64_t AB_count,
+        uint64_t total_num_observations
+    )
+    {
+        // Returns a signed likelihood ratio statistic: positive when A and B co-occur
+        // more often than independence predicts, negative when less often, and 0 when
+        // there is no evidence either way.
+        DLIB_ASSERT(AB_count <= A_count && A_count <= total_num_observations,
+            "AB_count: " << AB_count << ", A_count: "<< A_count << ", total_num_observations: " << total_num_observations);
+        DLIB_ASSERT(AB_count <= B_count && B_count <= total_num_observations,
+            "AB_count: " << AB_count << ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations);
+        // With no observations at all there is nothing to measure.
+        if (total_num_observations == 0)
+            return 0;
+        DLIB_ASSERT(A_count + B_count - AB_count <= total_num_observations,
+            "AB_count: " << AB_count << " A_count: " << A_count <<  ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations);
+        // If B never happened, or happened in every observation, then one of the two
+        // conditional samples compared below has zero trials.  Its success rate would
+        // be 0/0 == NaN, which would turn the return value into NaN.  An event that
+        // never varies can't provide evidence of correlation, so report 0.  This also
+        // matches what is returned when A is the degenerate event.
+        if (B_count == 0 || B_count == total_num_observations)
+            return 0;
+        const uint64_t AnotB_count = A_count - AB_count;
+        const uint64_t notB_count = total_num_observations - B_count;
+        // How likely is it that the odds of A happening is different when conditioned on
+        // whether or not B happened?
+        const double cor = binomial_random_vars_are_different(
+            AB_count, B_count,      // A conditional on the presence of B
+            AnotB_count, notB_count // A conditional on the absence of B
+        );
+        // Check if there are more or less co-occurrences than expected (if A and B were
+        // unrelated) and use that to give the return value its sign.
+        const double expected_AB_count_if_unrelated = (A_count/static_cast<double>(total_num_observations))*B_count;
+        if (AB_count >= expected_AB_count_if_unrelated)
+            return cor;
+        else
+            return -cor;
+    }
 // ----------------------------------------------------------------------------------------
 }

--- a/dlib/statistics/statistics_abstract.h
+++ b/dlib/statistics/statistics_abstract.h
@@ -105,6 +105,73 @@ namespace dlib
              (i.e. mean(squared(mat(a)-mat(b))))
    !*/
+// ----------------------------------------------------------------------------------------
+    double binomial_random_vars_are_different (
+        uint64_t k1,
+        uint64_t n1,
+        uint64_t k2,
+        uint64_t n2
+    );
+    /*!
+        requires
+            - k1 <= n1
+            - k2 <= n2
+        ensures
+            - Given two Binomially distributed random variables, X1 and X2, we want to know
+              if these variables have the same parameter (i.e. the chance of "success").
+              So assume that:
+                - You observed X1 to give k1 successes out of n1 trials.
+                - You observed X2 to give k2 successes out of n2 trials.
+            - This function performs a simple likelihood ratio test to determine if X1 and
+              X2 have the same parameter.  The return value of this function will be 0 if
+              they are probably the same or it will be some positive number otherwise.
+              Moreover, the larger the return value the more likely it is that X1 and X2
+              have different distributions.
+            - More precisely, the returned value is the log likelihood ratio
+                  LL(k1/n1,k1,n1) + LL(k2/n2,k2,n2) - LL(p,k1,n1) - LL(p,k2,n2)
+              where p == (k1+k2)/(n1+n2) is the pooled success rate and
+              LL(q,k,n) == k*log(q) + (n-k)*log(1-q), taken to be 0 when q is 0 or 1.
+              Note that this value is not multiplied by 2.
+            - For a discussion of the technique and applications see:
+                  Dunning, Ted. "Accurate methods for the statistics of surprise and
+                  coincidence." Computational linguistics 19.1 (1993): 61-74.
+    !*/
+// ----------------------------------------------------------------------------------------
+    double event_correlation (
+        uint64_t A_count,
+        uint64_t B_count,
+        uint64_t AB_count,
+        uint64_t total_num_observations
+    );
+    /*!
+        requires
+            - AB_count <= A_count <= total_num_observations
+            - AB_count <= B_count <= total_num_observations
+            - A_count + B_count - AB_count <= total_num_observations
+        ensures
+            - This function does a statistical test to determine if two events co-occur in
+              a statistically significant way.  In particular, we assume you performed
+              total_num_observations measurements and during those measurements you:
+                - Observed event A to happen A_count times.
+                - Observed event B to happen B_count times.
+                - Observed AB_count co-occurrences of the events.  That is, AB_count is the
+                  number of times the events happened together during the same measurement.
+            - This function returns a number, COR, which can take any real value.  It has
+              the following interpretations:
+                - COR == 0: there is no evidence of correlation between the two events.
+                  They appear to be unrelated.
+                - COR > 0: There is evidence that A and B co-occur together.  That is,
+                  they happen at the same times more often than you would expect if they
+                  were independent events.  The larger the magnitude of COR the more
+                  evidence we have for the correlation.
+                - COR < 0: There is evidence that A and B are anti-correlated.  That is,
+                  when A happens B is unlikely to happen and vice versa.  The larger the
+                  magnitude of COR the more evidence we have for the anti-correlation.
+            - The magnitude of COR comes from binomial_random_vars_are_different(), which
+              compares how often A happens when B is present against how often A happens
+              when B is absent.  The sign of COR is non-negative when AB_count is at least
+              A_count*B_count/total_num_observations (the number of co-occurrences
+              expected if A and B were unrelated) and non-positive otherwise.
+            - if (total_num_observations == 0) then
+                - returns 0
+            - This function implements the simple likelihood ratio test discussed in the
+              following paper:
+                  Dunning, Ted. "Accurate methods for the statistics of surprise and
+                  coincidence." Computational linguistics 19.1 (1993): 61-74.
+              So for an extended discussion of the method see the above paper.
+    !*/
 // ----------------------------------------------------------------------------------------
    template <

--- a/dlib/test/statistics.cpp
+++ b/dlib/test/statistics.cpp
@@ -803,6 +803,22 @@ namespace
            DLIB_TEST(equal_error_rate(vals2, vals1).first == 1);
        }
+        // Checks event_correlation() against known values and basic invariants: exact
+        // zero when the co-occurrence count matches independence, sign flips around that
+        // point, and symmetry when the roles of A and B are swapped.
+        void test_event_corr()
+        {
+            print_spinner();
+            // 1000/2000 * 1000 == 500 co-occurrences is exactly what independence
+            // predicts, so the statistic must be exactly 0.
+            DLIB_TEST(event_correlation(1000,1000,500,2000) == 0);
+            // 200 fewer / 200 more co-occurrences than expected give the same
+            // magnitude with opposite signs.
+            DLIB_TEST(std::abs(event_correlation(1000,1000,300,2000) + 164.565757010104) < 1e-11);
+            DLIB_TEST(std::abs(event_correlation(1000,1000,700,2000) - 164.565757010104) < 1e-11);
+            // 10/2000 * 1000 == 5 is the expected count under independence, regardless
+            // of which event is the rare one.
+            DLIB_TEST(event_correlation(10,1000,5,2000) == 0);
+            DLIB_TEST(event_correlation(1000,10,5,2000) == 0);
+            // Swapping the A and B counts must not change the result.
+            DLIB_TEST(std::abs(event_correlation(10,1000,1,2000) - event_correlation(1000,10,1,2000)) < 1e-11);
+            DLIB_TEST(std::abs(event_correlation(10,1000,9,2000) - event_correlation(1000,10,9,2000)) < 1e-11);
+            // 1 and 9 co-occurrences are 4 below and 4 above the expected 5: same
+            // magnitude, opposite signs.
+            DLIB_TEST(std::abs(event_correlation(10,1000,1,2000) + 3.69672251700842) < 1e-11);
+            DLIB_TEST(std::abs(event_correlation(10,1000,9,2000) - 3.69672251700842) < 1e-11);
+        }
        void perform_test (
        )
        {
@@ -824,6 +840,7 @@ namespace
            another_test();
            test_average_precision();
            test_lda();
+            test_event_corr();
        }
    } a;