Commit aa74346e authored by Davis King's avatar Davis King

Added some missing DLIB_ASSERTs to the random_subset_selector. I also gave it

an empty add() and a next_add_accepts() so that the user doesn't have to load
data samples that wouldn't get selected for inclusion in the random subset.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403424
parent ba1d50cc
...@@ -22,12 +22,14 @@ namespace dlib ...@@ -22,12 +22,14 @@ namespace dlib
- _max_size == 0 - _max_size == 0
- items.size() == 0 - items.size() == 0
- count == 0 - count == 0
- _next_add_accepts == false
CONVENTION CONVENTION
- count == the number of times add() has been called since the last - count == the number of times add() has been called since the last
time this object was empty. time this object was empty.
- items.size() == size() - items.size() == size()
- max_size() == _max_size - max_size() == _max_size
- next_add_accepts() == _next_add_accepts
!*/ !*/
public: public:
typedef T type; typedef T type;
...@@ -55,6 +57,7 @@ namespace dlib ...@@ -55,6 +57,7 @@ namespace dlib
{ {
items.resize(0); items.resize(0);
count = 0; count = 0;
update_next_add_accepts();
} }
unsigned long size ( unsigned long size (
...@@ -70,6 +73,7 @@ namespace dlib ...@@ -70,6 +73,7 @@ namespace dlib
items.reserve(new_max_size); items.reserve(new_max_size);
make_empty(); make_empty();
_max_size = new_max_size; _max_size = new_max_size;
update_next_add_accepts();
} }
unsigned long max_size ( unsigned long max_size (
...@@ -82,6 +86,15 @@ namespace dlib ...@@ -82,6 +86,15 @@ namespace dlib
unsigned long idx unsigned long idx
) )
{ {
// make sure requires clause is not broken
DLIB_ASSERT(idx < size(),
"\tvoid random_subset_selector::operator[]()"
<< "\n\t idx is out of range"
<< "\n\t idx: " << idx
<< "\n\t size(): " << size()
<< "\n\t this: " << this
);
return items[idx]; return items[idx];
} }
...@@ -89,6 +102,15 @@ namespace dlib ...@@ -89,6 +102,15 @@ namespace dlib
unsigned long idx unsigned long idx
) const ) const
{ {
// make sure requires clause is not broken
DLIB_ASSERT(idx < size(),
"\tvoid random_subset_selector::operator[]()"
<< "\n\t idx is out of range"
<< "\n\t idx: " << idx
<< "\n\t size(): " << size()
<< "\n\t this: " << this
);
return items[idx]; return items[idx];
} }
...@@ -97,6 +119,12 @@ namespace dlib ...@@ -97,6 +119,12 @@ namespace dlib
iterator end() { return items.end(); } iterator end() { return items.end(); }
const_iterator end() const { return items.end(); } const_iterator end() const { return items.end(); }
// Reports the cached accept/reject decision for the next sample: true means the
// next call to add(item) will store item in this object, false means it would be
// ignored (so the caller may use the argument-free add() instead).  This only
// reads _next_add_accepts; the flag itself is recomputed by
// update_next_add_accepts().
bool next_add_accepts (
) const
{
return _next_add_accepts;
}
void add ( void add (
const T& new_item const T& new_item
) )
...@@ -107,14 +135,62 @@ namespace dlib ...@@ -107,14 +135,62 @@ namespace dlib
// swap into a random place // swap into a random place
exchange(items[rnd.get_random_32bit_number()%items.size()], items.back()); exchange(items[rnd.get_random_32bit_number()%items.size()], items.back());
} }
else if (_next_add_accepts)
{
// pick a random element of items and replace it.
items[rnd.get_random_32bit_number()%items.size()] = new_item;
}
update_next_add_accepts();
++count;
}
// Argument-free overload of add(): counts one sample that is known to be
// rejected, without the caller having to supply (or load) the sample itself.
// Requires next_add_accepts() == false; the stored items are left untouched.
void add (
)
{
// make sure requires clause is not broken
DLIB_ASSERT(next_add_accepts() == false,
"\tvoid random_subset_selector::add()"
<< "\n\t You should be calling the version of add() that takes an argument"
<< "\n\t this: " << this
);
// Decide whether the following sample will be accepted.  This is done before
// count is incremented, the same order used by add(const T&).
update_next_add_accepts();
// This call still counts as one observed sample.
++count;
}
// Exchanges the entire state of *this with a: the stored items, the maximum
// size, the count of add() calls, the random number generator and the cached
// next_add_accepts() flag.
void swap (
    random_subset_selector& a
)
{
    // Swap the item storage of the two objects.  The previous code called
    // a.swap(a.items), which hands a std::vector to a function expecting a
    // random_subset_selector& and therefore could not compile when
    // instantiated (and never exchanged the items).
    items.swap(a.items);
    std::swap(_max_size, a._max_size);
    std::swap(count, a.count);
    rnd.swap(a.rnd);
    std::swap(_next_add_accepts, a._next_add_accepts);
}
private:
void update_next_add_accepts (
)
{
if (items.size() < _max_size)
{
_next_add_accepts = true;
}
else if (_max_size == 0)
{
_next_add_accepts = false;
}
else else
{ {
// At this point each element of items has had an equal chance of being in this object. // At this point each element of items has had an equal chance of being in this object.
// In particular, the probability that each arrived here is currently items.size()/count. // In particular, the probability that each arrived here is currently items.size()/count.
// We need to be able to say that, after this function ends, the probability of any // We need to be able to say that, after this function ends, the probability of any
// particular object ending up in items is items.size()/(count+1). So this means that // particular object ending up in items is items.size()/(count+1). So this means that
// we should decide to add new_item into items with this probability. If we do so then // we should decide to add a new item into items with this probability. Also, if we do
// we pick one of the current items and replace it at random with new_item. // so then we pick one of the current items and replace it at random with the new item.
// Make me a random 64 bit number. This might seem excessive but I want this object // Make me a random 64 bit number. This might seem excessive but I want this object
// to be able to handle an effectively infinite number of calls to add(). So count // to be able to handle an effectively infinite number of calls to add(). So count
...@@ -127,34 +203,19 @@ namespace dlib ...@@ -127,34 +203,19 @@ namespace dlib
num %= (count+1); num %= (count+1);
if (num < items.size()) _next_add_accepts = (num < items.size());
{
// pick a random element of items and replace it.
items[rnd.get_random_32bit_number()%items.size()] = new_item;
}
}
++count;
} }
void swap (
random_subset_selector& a
)
{
a.swap(a.items);
std::swap(_max_size, a._max_size);
std::swap(count, a.count);
rnd.swap(a.rnd);
} }
private:
std::vector<T> items; std::vector<T> items;
unsigned long _max_size; unsigned long _max_size;
uint64 count; uint64 count;
rand_type rnd; rand_type rnd;
bool _next_add_accepts;
}; };
template < template <
......
...@@ -25,6 +25,7 @@ namespace dlib ...@@ -25,6 +25,7 @@ namespace dlib
INITIAL VALUE INITIAL VALUE
- size() == 0 - size() == 0
- max_size() == 0 - max_size() == 0
- next_add_accepts() == false
WHAT THIS OBJECT REPRESENTS WHAT THIS OBJECT REPRESENTS
This object is a tool to help you select a random subset of a large body of data. This object is a tool to help you select a random subset of a large body of data.
...@@ -43,6 +44,24 @@ namespace dlib ...@@ -43,6 +44,24 @@ namespace dlib
random I mean that each of the 1000000 data samples has an equal chance of ending random I mean that each of the 1000000 data samples has an equal chance of ending
up in the rand_subset object. up in the rand_subset object.
Note that the above example calls get_next_data_sample() for each data sample. This
may be inefficient since most of the data samples are just ignored. An alternative
method that doesn't require you to load each sample can also be used. Consider the
following:
random_subset_selector<sample_type> rand_subset;
rand_subset.set_max_size(1000);
for (int i = 0; i < 1000000; ++i)
if (rand_subset.next_add_accepts())
rand_subset.add(get_data_sample(i));
else
rand_subset.add();
In the above example we only actually fetch the data sample into memory if we
know that the rand_subset would include it into the random subset. Otherwise,
we can just call the empty add().
!*/ !*/
public: public:
typedef T type; typedef T type;
...@@ -118,18 +137,39 @@ namespace dlib ...@@ -118,18 +137,39 @@ namespace dlib
- returns a const reference to the idx'th element of this object - returns a const reference to the idx'th element of this object
!*/ !*/
bool next_add_accepts (
) const;
/*!
ensures
- if (the next call to add(item) will result in item being included
into *this) then
- returns true
- Note that the next item will always be accepted if size() < max_size().
- else
- returns false
- Note that the next item will never be accepted if max_size() == 0.
!*/
void add ( void add (
const T& new_item const T& new_item
); );
/*! /*!
ensures ensures
- if (next_add_accepts()) then
- places new_item into *this object at a random location
- if (size() < max_size()) then - if (size() < max_size()) then
- #size() == size() + 1 - #size() == size() + 1
- places new_item into *this object at a random location - #next_add_accepts() == The updated information about the acceptance
- else of the next call to add()
- randomly does one of the following: !*/
- ignores new_item and makes no change
- replaces a random element of *this with new_item void add (
);
/*!
requires
- next_add_accepts() == false
ensures
- This function does nothing but update the value of #next_add_accepts()
!*/ !*/
iterator begin( iterator begin(
......
...@@ -60,10 +60,44 @@ namespace ...@@ -60,10 +60,44 @@ namespace
} }
} }
void test_random_subset_selector2 ()
{
random_subset_selector<double> rand_set;
for (int j = 0; j < 30; ++j)
{
print_spinner();
running_stats<double> rs, rs2;
rand_set.set_max_size(1000);
for (double i = 0; i < 100000; ++i)
{
rs.add(i);
if (rand_set.next_add_accepts())
rand_set.add(i);
else
rand_set.add();
}
for (unsigned long i = 0; i < rand_set.size(); ++i)
rs2.add(rand_set[i]);
dlog << LDEBUG << "true mean: " << rs.mean();
dlog << LDEBUG << "true sampled: " << rs2.mean();
double ratio = rs.mean()/rs2.mean();
DLIB_TEST_MSG(0.96 < ratio && ratio < 1.04, " ratio: " << ratio);
}
}
void perform_test ( void perform_test (
) )
{ {
test_random_subset_selector(); test_random_subset_selector();
test_random_subset_selector2();
} }
} a; } a;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment