Commit 6cad6741 authored by Davis King

Added the random_subset_selector object.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403413
parent c88d8b12
......@@ -5,6 +5,7 @@
#include "statistics/statistics.h"
#include "statistics/dpca.h"
#include "statistics/random_subset_selector.h"
#endif // DLIB_STATISTICs_H_
......
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_RANDOM_SUBSeT_SELECTOR_H_
#define DLIB_RANDOM_SUBSeT_SELECTOR_H_
#include "random_subset_selector_abstract.h"
#include "../rand.h"
#include <vector>
#include "../algs.h"
#include "../memory_manager.h"
namespace dlib
{
template <
typename T,
typename Rand_type = dlib::rand::kernel_1a
>
class random_subset_selector
{
/*!
INITIAL VALUE
- _max_size == 0
- items.size() == 0
- count == 0
CONVENTION
- count == the number of times add() has been called since the last
time this object was empty.
- items.size() == size()
- max_size() == _max_size
!*/
public:
typedef T type;
typedef memory_manager<char>::kernel_1a mem_manager_type;
typedef Rand_type rand_type;
typedef typename std::vector<T>::iterator iterator;
typedef typename std::vector<T>::const_iterator const_iterator;
random_subset_selector (
)
{
_max_size = 0;
make_empty();
}
void set_seed(const std::string& value)
{
rnd.set_seed(value);
}
void make_empty (
)
{
items.resize(0);
count = 0;
}
unsigned long size (
) const
{
return items.size();
}
void set_max_size (
unsigned long new_max_size
)
{
items.reserve(new_max_size);
make_empty();
_max_size = new_max_size;
}
unsigned long max_size (
) const
{
return _max_size;
}
T& operator[] (
unsigned long idx
)
{
return items[idx];
}
const T& operator[] (
unsigned long idx
) const
{
return items[idx];
}
iterator begin() { return items.begin(); }
const_iterator begin() const { return items.begin(); }
iterator end() { return items.end(); }
const_iterator end() const { return items.end(); }
void add (
const T& new_item
)
{
if (items.size() < _max_size)
{
items.push_back(new_item);
// swap into a random place
exchange(items[rnd.get_random_32bit_number()%items.size()], items.back());
}
else
{
// At this point each element of items has had an equal chance of being in this object.
// In particular, the probability that each arrived here is currently items.size()/count.
// We need to be able to say that, after this function ends, the probability of any
// particular object ending up in items is items.size()/(count+1). So this means that
// we should decide to add new_item into items with this probability. If we do so then
// we pick one of the current items and replace it at random with new_item.
// Make me a random 64 bit number. This might seem excessive but I want this object
// to be able to handle an effectively infinite number of calls to add(). So count
// might get very large and we need to deal with that properly.
const unsigned long num1 = rnd.get_random_32bit_number();
const unsigned long num2 = rnd.get_random_32bit_number();
uint64 num = num1;
num <<= 32;
num |= num2;
num %= (count+1);
if (num < items.size())
{
// pick a random element of items and replace it.
items[rnd.get_random_32bit_number()%items.size()] = new_item;
}
}
++count;
}
void swap (
random_subset_selector& a
)
{
a.swap(a.items);
std::swap(_max_size, a._max_size);
std::swap(count, a.count);
rnd.swap(a.rnd);
}
private:
std::vector<T> items;
unsigned long _max_size;
uint64 count;
rand_type rnd;
};
// Free-function swap for random_subset_selector.  It simply forwards to the
// member function a.swap(b).
template <typename T, typename Rand_type>
void swap (
    random_subset_selector<T,Rand_type>& a,
    random_subset_selector<T,Rand_type>& b
)
{
    a.swap(b);
}
}
#endif // DLIB_RANDOM_SUBSeT_SELECTOR_H_
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
#ifdef DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
#include <vector>
#include "../rand.h"
#include "../memory_manager.h"
namespace dlib
{
template <
typename T,
typename Rand_type = dlib::rand::kernel_1a
>
class random_subset_selector
{
/*!
    REQUIREMENTS ON T
        T must be a copyable type

    REQUIREMENTS ON Rand_type
        must be an implementation of dlib/rand/rand_kernel_abstract.h

    INITIAL VALUE
        - size() == 0
        - max_size() == 0

    WHAT THIS OBJECT REPRESENTS
        This object is a tool to help you select a random subset of a large body of data.
        In particular, it is useful when the body of data is too large to fit into memory.

        So for example, suppose you have 1000000 data samples and you want to select a
        random subset of size 1000.  Then you could do that as follows:

            random_subset_selector<sample_type> rand_subset;
            rand_subset.set_max_size(1000);
            for (int i = 0; i < 1000000; ++i)
                rand_subset.add( get_next_data_sample());

        At the end of the for loop you will have your random subset of 1000 samples.  And by
        random I mean that each of the 1000000 data samples has an equal chance of ending
        up in the rand_subset object.

        Note that max_size() is initially 0, so set_max_size() must be called before
        add() will store anything.
!*/
public:
typedef T type;
typedef memory_manager<char>::kernel_1a mem_manager_type;
typedef Rand_type rand_type;
typedef typename std::vector<T>::iterator iterator;
typedef typename std::vector<T>::const_iterator const_iterator;
random_subset_selector (
);
/*!
    ensures
        - this object is properly initialized
!*/
void set_seed(
const std::string& value
);
/*!
    ensures
        - sets the seed of the random number generator that is embedded in
          this object to the given value.
!*/
void make_empty (
);
/*!
    ensures
        - #size() == 0
!*/
unsigned long size (
) const;
/*!
    ensures
        - returns the number of items of type T currently contained in this object
!*/
void set_max_size (
unsigned long new_max_size
);
/*!
    ensures
        - #max_size() == new_max_size
        - #size() == 0
!*/
unsigned long max_size (
) const;
/*!
    ensures
        - returns the maximum allowable size for this object
!*/
T& operator[] (
unsigned long idx
);
/*!
    requires
        - idx < size()
    ensures
        - returns a non-const reference to the idx'th element of this object
!*/
const T& operator[] (
unsigned long idx
) const;
/*!
    requires
        - idx < size()
    ensures
        - returns a const reference to the idx'th element of this object
!*/
void add (
const T& new_item
);
/*!
    ensures
        - if (size() < max_size()) then
            - #size() == size() + 1
            - places new_item into *this object at a random location
        - else
            - randomly does one of the following:
                - ignores new_item and makes no change
                - replaces a random element of *this with new_item
!*/
iterator begin(
);
/*!
    ensures
        - if (size() > 0) then
            - returns an iterator referring to the first element in
              this container.
        - else
            - returns end()
!*/
const_iterator begin(
) const;
/*!
    ensures
        - if (size() > 0) then
            - returns a const_iterator referring to the first element in
              this container.
        - else
            - returns end()
!*/
iterator end(
);
/*!
    ensures
        - returns an iterator that represents one past the end of
          this container
!*/
const_iterator end(
) const;
/*!
    ensures
        - returns an iterator that represents one past the end of
          this container
!*/
void swap (
random_subset_selector& item
);
/*!
    ensures
        - swaps *this and item
!*/
};
template <typename T, typename Rand_type>
void swap (
    random_subset_selector<T,Rand_type>& a,
    random_subset_selector<T,Rand_type>& b
)
{
    a.swap(b);
}
/*!
    provides global swap support by forwarding to a.swap(b)
!*/
}
#endif // DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
......@@ -74,6 +74,7 @@ set (tests
stack.cpp
static_map.cpp
static_set.cpp
statistics.cpp
std_vector_c.cpp
string.cpp
svm.cpp
......
......@@ -84,6 +84,7 @@ SRC += sockstreambuf.cpp
SRC += stack.cpp
SRC += static_map.cpp
SRC += static_set.cpp
SRC += statistics.cpp
SRC += std_vector_c.cpp
SRC += string.cpp
SRC += svm.cpp
......
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#include <sstream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <dlib/statistics.h>
#include "tester.h"
namespace
{
using namespace test;
using namespace dlib;
using namespace std;
// Logger used by the tests in this file; it is constructed with the name "test.statistics".
logger dlog("test.statistics");
class statistics_tester : public tester
{
public:
statistics_tester (
) :
tester ("test_statistics",
"Runs tests on the statistics component.")
{}
void test_random_subset_selector ()
{
random_subset_selector<double> rand_set;
for (int j = 0; j < 30; ++j)
{
print_spinner();
running_stats<double> rs, rs2;
rand_set.set_max_size(1000);
for (double i = 0; i < 100000; ++i)
{
rs.add(i);
rand_set.add(i);
}
for (unsigned long i = 0; i < rand_set.size(); ++i)
rs2.add(rand_set[i]);
dlog << LDEBUG << "true mean: " << rs.mean();
dlog << LDEBUG << "true sampled: " << rs2.mean();
double ratio = rs.mean()/rs2.mean();
DLIB_TEST_MSG(0.96 < ratio && ratio < 1.04, " ratio: " << ratio);
}
}
void perform_test (
)
{
test_random_subset_selector();
}
} a;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment