Commit aa74346e authored by Davis King's avatar Davis King

Added some missing DLIB_ASSERTs to the random_subset_selector. I also gave it
an empty add() and a next_add_accepts() so that the user doesn't have to load
data samples that wouldn't get selected for inclusion in the random subset.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403424
parent ba1d50cc
......@@ -22,12 +22,14 @@ namespace dlib
- _max_size == 0
- items.size() == 0
- count == 0
- _next_add_accepts == false
CONVENTION
- count == the number of times add() has been called since the last
time this object was empty.
- items.size() == size()
- max_size() == _max_size
- next_add_accepts() == _next_add_accepts
!*/
public:
typedef T type;
......@@ -55,6 +57,7 @@ namespace dlib
{
items.resize(0);
count = 0;
update_next_add_accepts();
}
unsigned long size (
......@@ -70,6 +73,7 @@ namespace dlib
items.reserve(new_max_size);
make_empty();
_max_size = new_max_size;
update_next_add_accepts();
}
unsigned long max_size (
......@@ -82,6 +86,15 @@ namespace dlib
unsigned long idx
)
{
// make sure requires clause is not broken
DLIB_ASSERT(idx < size(),
"\tvoid random_subset_selector::operator[]()"
<< "\n\t idx is out of range"
<< "\n\t idx: " << idx
<< "\n\t size(): " << size()
<< "\n\t this: " << this
);
return items[idx];
}
......@@ -89,6 +102,15 @@ namespace dlib
unsigned long idx
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(idx < size(),
"\tvoid random_subset_selector::operator[]()"
<< "\n\t idx is out of range"
<< "\n\t idx: " << idx
<< "\n\t size(): " << size()
<< "\n\t this: " << this
);
return items[idx];
}
......@@ -97,6 +119,12 @@ namespace dlib
iterator end() { return items.end(); }
const_iterator end() const { return items.end(); }
// Returns the cached _next_add_accepts flag, i.e. whether the next call to
// add(item) will store the item.  The flag is computed ahead of time, so
// calling this neither draws random numbers nor changes any state.
bool next_add_accepts () const
{
    return _next_add_accepts;
}
void add (
const T& new_item
)
......@@ -107,14 +135,62 @@ namespace dlib
// swap into a random place
exchange(items[rnd.get_random_32bit_number()%items.size()], items.back());
}
else if (_next_add_accepts)
{
// pick a random element of items and replace it.
items[rnd.get_random_32bit_number()%items.size()] = new_item;
}
update_next_add_accepts();
++count;
}
// Overload of add() that takes no item.  It is meant for the case where
// next_add_accepts() == false, so the caller has nothing to hand over: the call
// is still counted and the accept/reject decision for the following add() is
// recomputed.
void add (
)
{
// make sure requires clause is not broken
DLIB_ASSERT(next_add_accepts() == false,
"\tvoid random_subset_selector::add()"
<< "\n\t You should be calling the version of add() that takes an argument"
<< "\n\t this: " << this
);
// update_next_add_accepts() reads count (its random draw is reduced modulo
// count+1), so it runs before the increment, in the same order used by
// add(const T&).
// NOTE(review): because count has not been incremented yet, the decision for the
// *next* item is drawn modulo the number of add() calls made so far rather than
// that number plus one -- confirm this is not an off-by-one in the acceptance
// probability.
update_next_add_accepts();
++count;
}
// Exchanges the entire state of *this with a: the stored samples, the maximum
// size, the add() counter, the random number generator and the cached
// next_add_accepts flag.
void swap (
    random_subset_selector& a
)
{
    // Exchange the sample storage of the two objects.  This used to read
    // a.swap(a.items), which hands a std::vector to this very function (it only
    // accepts a random_subset_selector&) and never touched this->items.
    items.swap(a.items);
    std::swap(_max_size, a._max_size);
    std::swap(count, a.count);
    rnd.swap(a.rnd);
    // Keep the cached decision paired with the state it was computed from.
    std::swap(_next_add_accepts, a._next_add_accepts);
}
private:
void update_next_add_accepts (
)
{
if (items.size() < _max_size)
{
_next_add_accepts = true;
}
else if (_max_size == 0)
{
_next_add_accepts = false;
}
else
{
// At this point each element of items has had an equal chance of being in this object.
// In particular, the probability that each arrived here is currently items.size()/count.
// We need to be able to say that, after this function ends, the probability of any
// particular object ending up in items is items.size()/(count+1). So this means that
// we should decide to add new_item into items with this probability. If we do so then
// we pick one of the current items and replace it at random with new_item.
// we should decide to add a new item into items with this probability. Also, if we do
// so then we pick one of the current items and replace it at random with the new item.
// Make me a random 64 bit number. This might seem excessive but I want this object
// to be able to handle an effectively infinite number of calls to add(). So count
......@@ -127,34 +203,19 @@ namespace dlib
num %= (count+1);
if (num < items.size())
{
// pick a random element of items and replace it.
items[rnd.get_random_32bit_number()%items.size()] = new_item;
}
_next_add_accepts = (num < items.size());
}
++count;
}
// Exchanges the stored samples, the maximum size, the add() counter and the
// random number generator of *this with those of a.
// NOTE(review): unlike the other swap() definition in this listing, this copy
// does not exchange _next_add_accepts -- confirm which copy is the live one.
void swap (
    random_subset_selector& a
)
{
    // Exchange the sample storage of the two objects.  This used to read
    // a.swap(a.items), which hands a std::vector to this very function (it only
    // accepts a random_subset_selector&) and never touched this->items.
    items.swap(a.items);
    std::swap(_max_size, a._max_size);
    std::swap(count, a.count);
    rnd.swap(a.rnd);
}
private:
std::vector<T> items;
unsigned long _max_size;
uint64 count;
rand_type rnd;
bool _next_add_accepts;
};
template <
......
......@@ -25,6 +25,7 @@ namespace dlib
INITIAL VALUE
- size() == 0
- max_size() == 0
- next_add_accepts() == false
WHAT THIS OBJECT REPRESENTS
This object is a tool to help you select a random subset of a large body of data.
......@@ -43,6 +44,24 @@ namespace dlib
random I mean that each of the 1000000 data samples has an equal chance of ending
up in the rand_subset object.
Note that the above example calls get_next_data_sample() for each data sample. This
may be inefficient since most of the data samples are just ignored. An alternative
method that doesn't require you to load each sample can also be used. Consider the
following:
random_subset_selector<sample_type> rand_subset;
rand_subset.set_max_size(1000);
for (int i = 0; i < 1000000; ++i)
if (rand_subset.next_add_accepts())
rand_subset.add(get_data_sample(i));
else
rand_subset.add();
In the above example we only actually fetch the data sample into memory if we
know that the rand_subset would include it into the random subset. Otherwise,
we can just call the empty add().
!*/
public:
typedef T type;
......@@ -118,18 +137,39 @@ namespace dlib
- returns a const reference to the idx'th element of this object
!*/
bool next_add_accepts (
) const;
/*!
ensures
- if (the next call to add(item) will result in item being included
into *this) then
- returns true
- Note that the next item will always be accepted if size() < max_size().
- else
- returns false
- Note that the next item will never be accepted if max_size() == 0.
!*/
void add (
const T& new_item
);
/*!
ensures
- if (size() < max_size()) then
- #size() == size() + 1
- if (next_add_accepts()) then
- places new_item into *this object at a random location
- else
- randomly does one of the following:
- ignores new_item and makes no change
- replaces a random element of *this with new_item
- if (size() < max_size()) then
- #size() == size() + 1
- #next_add_accepts() == The updated information about the acceptance
of the next call to add()
!*/
void add (
);
/*!
requires
- next_add_accepts() == false
ensures
- This function does nothing but update the value of #next_add_accepts()
!*/
iterator begin(
......
......@@ -60,10 +60,44 @@ namespace
}
}
void test_random_subset_selector2 ()
{
random_subset_selector<double> rand_set;
for (int j = 0; j < 30; ++j)
{
print_spinner();
running_stats<double> rs, rs2;
rand_set.set_max_size(1000);
for (double i = 0; i < 100000; ++i)
{
rs.add(i);
if (rand_set.next_add_accepts())
rand_set.add(i);
else
rand_set.add();
}
for (unsigned long i = 0; i < rand_set.size(); ++i)
rs2.add(rand_set[i]);
dlog << LDEBUG << "true mean: " << rs.mean();
dlog << LDEBUG << "true sampled: " << rs2.mean();
double ratio = rs.mean()/rs2.mean();
DLIB_TEST_MSG(0.96 < ratio && ratio < 1.04, " ratio: " << ratio);
}
}
// Runs the two random_subset_selector test routines, one after the other.
void perform_test ()
{
    test_random_subset_selector();
    test_random_subset_selector2();
}
} a;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment