Commit c7fcf2ab authored by Davis King

Added binomial_random_vars_are_different() and event_correlation().

parent 126b4b76
......@@ -1818,6 +1818,71 @@ namespace dlib
serialize(item.pca, out);
}
// ----------------------------------------------------------------------------------------
// Likelihood ratio statistic for testing whether two binomial samples share the
// same success probability.
//
// k1/n1 and k2/n2 are the observed successes/trials of the two samples.  The
// return value is the log-likelihood of the data when each sample uses its own
// estimated rate (k1/n1 and k2/n2) minus the log-likelihood when both samples
// use the pooled rate (k1+k2)/(n1+n2).  It is 0 when the estimated rates
// coincide and grows as the evidence that the rates differ gets stronger.
//
// Requires k1 <= n1 and k2 <= n2 (checked by DLIB_ASSERT).
inline double binomial_random_vars_are_different (
    uint64_t k1,
    uint64_t n1,
    uint64_t k2,
    uint64_t n2
)
{
    DLIB_ASSERT(k1 <= n1, "k1: "<< k1 << " n1: "<< n1);
    DLIB_ASSERT(k2 <= n2, "k2: "<< k2 << " n2: "<< n2);

    // A sample with no trials carries no information: its log-likelihood is 0
    // under any rate, and the pooled rate then equals the other sample's own
    // rate, so the statistic is exactly 0.  Handling this up front also avoids
    // evaluating 0/0 below, which would otherwise propagate NaN to the caller.
    if (n1 == 0 || n2 == 0)
        return 0;

    const double p1 = k1/static_cast<double>(n1);
    const double p2 = k2/static_cast<double>(n2);
    const double p = (k1+k2)/static_cast<double>(n1+n2);

    // Binomial log-likelihood of k successes in n trials at rate p.  The
    // binomial coefficient is omitted because it cancels in the ratio.
    auto ll = [](double p, uint64_t k, uint64_t n) {
        // At p == 0 or p == 1 the expression below would evaluate 0*log(0);
        // use the limiting value 0 instead.
        if (p == 0 || p == 1)
            return 0.0;
        return k*std::log(p) + (n-k)*std::log(1-p);
    };

    return ll(p1,k1,n1) + ll(p2,k2,n2) - ll(p,k1,n1) - ll(p,k2,n2);
}
// ----------------------------------------------------------------------------------------
// Signed likelihood-ratio measure of how strongly events A and B co-occur.
//
// A_count and B_count are the number of observations in which A and B
// happened, AB_count is the number in which both happened, and
// total_num_observations is the total number of observations.  The magnitude
// of the result is the likelihood ratio statistic comparing the rate of A when
// B happened against the rate of A when B did not happen.  The sign is
// positive when there are at least as many co-occurrences as independence
// would predict and negative otherwise.
inline double event_correlation (
    uint64_t A_count,
    uint64_t B_count,
    uint64_t AB_count,
    uint64_t total_num_observations
)
{
    DLIB_ASSERT(AB_count <= A_count && A_count <= total_num_observations,
        "AB_count: " << AB_count << ", A_count: "<< A_count << ", total_num_observations: " << total_num_observations);
    DLIB_ASSERT(AB_count <= B_count && B_count <= total_num_observations,
        "AB_count: " << AB_count << ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations);

    // Nothing was observed, so there is no evidence either way.
    if (total_num_observations == 0)
        return 0;

    DLIB_ASSERT(A_count + B_count - AB_count <= total_num_observations,
        "AB_count: " << AB_count << " A_count: " << A_count << ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations);

    // If B never happened, or happened in every observation, one of the two
    // conditional samples below is empty and there is nothing to compare A's
    // rate against.  Report "no evidence" rather than letting a 0/0 turn the
    // result into NaN.  (The analogous degenerate cases for A already yield 0.)
    if (B_count == 0 || B_count == total_num_observations)
        return 0;

    const uint64_t AnotB_count = A_count - AB_count;
    const uint64_t notB_count = total_num_observations - B_count;

    // How likely is it that the odds of A happening is different when conditioned on
    // whether or not B happened?
    const double cor = binomial_random_vars_are_different(
        AB_count, B_count,       // A conditional on the presence of B
        AnotB_count, notB_count  // A conditional on the absence of B
    );

    // Check if there are more or less co-occurrences than expected (if A and B were
    // unrelated) and use that to give the return value its sign.
    const double expected_AB_count_if_unrelated =
        (A_count/static_cast<double>(total_num_observations))*B_count;
    if (AB_count >= expected_AB_count_if_unrelated)
        return cor;
    else
        return -cor;
}
// ----------------------------------------------------------------------------------------
}
......
......@@ -105,6 +105,73 @@ namespace dlib
(i.e. mean(squared(mat(a)-mat(b))))
!*/
// ----------------------------------------------------------------------------------------
double binomial_random_vars_are_different (
uint64_t k1,
uint64_t n1,
uint64_t k2,
uint64_t n2
);
/*!
requires
- k1 <= n1
- k2 <= n2
ensures
- Given two Binomially distributed random variables, X1 and X2, we want to know
if these variables have the same parameter (i.e. the chance of "success").
So assume that:
- You observed X1 to give k1 successes out of n1 trials.
- You observed X2 to give k2 successes out of n2 trials.
- This function performs a simple likelihood ratio test to determine if X1 and
X2 have the same parameter. The return value of this function will be 0 if
they are probably the same or it will be some positive number otherwise.
Moreover, the larger the return value the more likely it is that X1 and X2
have different distributions.
- More precisely, the return value is the log-likelihood of the observations
when X1 and X2 each use their own estimated parameter (k1/n1 and k2/n2)
minus the log-likelihood of the observations when both use the pooled
parameter (k1+k2)/(n1+n2).
- For a discussion of the technique and applications see:
Dunning, Ted. "Accurate methods for the statistics of surprise and
coincidence." Computational linguistics 19.1 (1993): 61-74.
!*/
// ----------------------------------------------------------------------------------------
double event_correlation (
uint64_t A_count,
uint64_t B_count,
uint64_t AB_count,
uint64_t total_num_observations
);
/*!
requires
- AB_count <= A_count <= total_num_observations
- AB_count <= B_count <= total_num_observations
- A_count + B_count - AB_count <= total_num_observations
ensures
- This function does a statistical test to determine if two events co-occur in
a statistically significant way. In particular, we assume you performed
total_num_observations measurements and during those measurements you:
- Observed event A to happen A_count times.
- Observed event B to happen B_count times.
- Observed AB_count co-occurrences of the events. That is, AB_count is the
number of times the events happened together during the same measurement.
- This function returns a number, COR, which can take any real value. It has
the following interpretations:
- COR == 0: there is no evidence of correlation between the two events.
They appear to be unrelated.
- COR > 0: There is evidence that A and B co-occur together. That is,
they happen at the same times more often than you would expect if they
were independent events. The larger the magnitude of COR the more
evidence we have for the correlation.
- COR < 0: There is evidence that A and B are anti-correlated. That is,
when A happens B is unlikely to happen and vice versa. The larger the
magnitude of COR the more evidence we have for the anti-correlation.
- The magnitude of COR is the value of
binomial_random_vars_are_different(AB_count, B_count, A_count-AB_count, total_num_observations-B_count),
i.e. it compares how often A happened when B was present to how often A
happened when B was absent.
- The sign of COR is non-negative when
AB_count >= (A_count/total_num_observations)*B_count, i.e. when there are
at least as many co-occurrences as expected if A and B were unrelated, and
negative otherwise.
- if (total_num_observations == 0) then
- returns 0
- This function implements the simple likelihood ratio test discussed in the
following paper:
Dunning, Ted. "Accurate methods for the statistics of surprise and
coincidence." Computational linguistics 19.1 (1993): 61-74.
So for an extended discussion of the method see the above paper.
!*/
// ----------------------------------------------------------------------------------------
template <
......
......@@ -803,6 +803,22 @@ namespace
DLIB_TEST(equal_error_rate(vals2, vals1).first == 1);
}
// Regression test for event_correlation().  The arguments of each call are
// (A_count, B_count, AB_count, total_num_observations).
void test_event_corr()
{
print_spinner();
// With A and B each seen in 1000 of 2000 observations, 500 co-occurrences is
// exactly what unrelated events would give, so the result must be 0.
DLIB_TEST(event_correlation(1000,1000,500,2000) == 0);
// 300 and 700 co-occurrences are equally far below and above that expected
// count: same magnitude, opposite sign.
DLIB_TEST(std::abs(event_correlation(1000,1000,300,2000) + 164.565757010104) < 1e-11);
DLIB_TEST(std::abs(event_correlation(1000,1000,700,2000) - 164.565757010104) < 1e-11);
// A rare event (10 occurrences) against a common one (1000 occurrences): the
// expected number of co-occurrences is 5, whichever event is passed as A.
DLIB_TEST(event_correlation(10,1000,5,2000) == 0);
DLIB_TEST(event_correlation(1000,10,5,2000) == 0);
// Swapping the roles of A and B must not change the result for these inputs.
DLIB_TEST(std::abs(event_correlation(10,1000,1,2000) - event_correlation(1000,10,1,2000)) < 1e-11);
DLIB_TEST(std::abs(event_correlation(10,1000,9,2000) - event_correlation(1000,10,9,2000)) < 1e-11);
// 1 and 9 co-occurrences lie on either side of the expected 5: same
// magnitude, opposite sign.
DLIB_TEST(std::abs(event_correlation(10,1000,1,2000) + 3.69672251700842) < 1e-11);
DLIB_TEST(std::abs(event_correlation(10,1000,9,2000) - 3.69672251700842) < 1e-11);
}
void perform_test (
)
{
......@@ -824,6 +840,7 @@ namespace
another_test();
test_average_precision();
test_lda();
test_event_corr();
}
} a;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment