Added some missing DLIB_ASSERTs to the random_subset_selector. I also gave it

an empty add() and a next_add_accepts() so that the user doesn't have to load data samples that wouldn't get selected for inclusion in the random subset. --HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403424

Added some missing DLIB_ASSERTs to the random_subset_selector. I also gave it
an empty add() and a next_add_accepts() so that the user doesn't have to load data samples that wouldn't get selected for inclusion in the random subset. --HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403424
aa74346e · Davis King · ba1d50cc · aa74346e · aa74346e · aa74346e
Commit aa74346e authored Jan 30, 2010 by Davis King
Showing with 161 additions and 26 deletions

random_subset_selector.h dlib/statistics/random_subset_selector.h +81 -20

random_subset_selector_abstract.h dlib/statistics/random_subset_selector_abstract.h +46 -6

statistics.cpp dlib/test/statistics.cpp +34 -0

No files found.
--- a/dlib/statistics/random_subset_selector.h
+++ b/dlib/statistics/random_subset_selector.h
@@ -22,12 +22,14 @@ namespace dlib
                - _max_size == 0
                - items.size() == 0
                - count == 0
+                - _next_add_accepts == false

            CONVENTION
                - count == the number of times add() has been called since the last
                  time this object was empty.
                - items.size() == size()
                - max_size() == _max_size
+                - next_add_accepts() == _next_add_accepts
        !*/
    public:
        typedef T type;
@@ -55,6 +57,7 @@ namespace dlib
        {
            items.resize(0);
            count = 0;
+            update_next_add_accepts();
        }

        unsigned long size (
@@ -70,6 +73,7 @@ namespace dlib
            items.reserve(new_max_size);
            make_empty();
            _max_size = new_max_size;
+            update_next_add_accepts();
        }

        unsigned long max_size (
@@ -82,6 +86,15 @@ namespace dlib
            unsigned long idx
        ) 
        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(idx < size(),
+                "\tvoid random_subset_selector::operator[]()"
+                << "\n\t idx is out of range"
+                << "\n\t idx:    " << idx 
+                << "\n\t size(): " << size() 
+                << "\n\t this:   " << this
+                );
+
            return items[idx];
        }

@@ -89,6 +102,15 @@ namespace dlib
            unsigned long idx
        ) const
        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(idx < size(),
+                "\tvoid random_subset_selector::operator[]()"
+                << "\n\t idx is out of range"
+                << "\n\t idx:    " << idx 
+                << "\n\t size(): " << size() 
+                << "\n\t this:   " << this
+                );
+
            return items[idx];
        }

@@ -97,6 +119,12 @@ namespace dlib
        iterator                end()                           { return items.end(); }
        const_iterator          end() const                     { return items.end(); }

+        bool next_add_accepts (  // reports whether the next add(item) call will store its item
+        ) const 
+        {
+            return _next_add_accepts;  // cached flag, refreshed by update_next_add_accepts()
+        }
+
        void add (
            const T& new_item
        )
@@ -107,14 +135,62 @@ namespace dlib
                // swap into a random place
                exchange(items[rnd.get_random_32bit_number()%items.size()], items.back());
            }
+            else if (_next_add_accepts)
+            {
+                // pick a random element of items and replace it.
+                items[rnd.get_random_32bit_number()%items.size()] = new_item;
+            }
+
+            update_next_add_accepts();
+            ++count;
+        }
+
+        void add (  // registers a sample that is being skipped; requires next_add_accepts() == false
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(next_add_accepts() == false,
+                "\tvoid random_subset_selector::add()"
+                << "\n\t You should be calling the version of add() that takes an argument"
+                << "\n\t this: " << this
+                );
+
+            update_next_add_accepts();  // decide now whether the following add() call will accept its item
+            ++count;  // NOTE(review): incremented after the update, so the modulo (count+1) there excludes this call -- confirm intended
+        }
+
+        void swap (  // exchanges the complete state of *this and a
+            random_subset_selector& a
+        )
+        {
+            items.swap(a.items);  // exchange the stored samples themselves
+            std::swap(_max_size, a._max_size);
+            std::swap(count, a.count);
+            rnd.swap(a.rnd);  // the random number generator travels with the rest of the state
+            std::swap(_next_add_accepts, a._next_add_accepts);  // cached decision was derived from the members swapped above
+        }
+
+    private:
+
+        void update_next_add_accepts (
+        )
+        {
+            if (items.size() < _max_size)
+            {
+                _next_add_accepts = true;
+            }
+            else if (_max_size == 0)
+            {
+                _next_add_accepts = false;
+            }
            else
            {
                // At this point each element of items has had an equal chance of being in this object.   
                // In particular, the probability that each arrived here is currently items.size()/count.    
                // We need to be able to say that, after this function ends, the probability of any 
                // particular object ending up in items is items.size()/(count+1).  So this means that 
-                // we should decide to add new_item into items with this probability.  If we do so then 
-                // we pick one of the current items and replace it at random with new_item.
+                // we should decide to add a new item into items with this probability.  Also, if we do 
+                // so then we pick one of the current items and replace it at random with the new item.

                // Make me a random 64 bit number.   This might seem excessive but I want this object
                // to be able to handle an effectively infinite number of calls to add().  So count
@@ -127,34 +203,19 @@ namespace dlib

                num %= (count+1);

-                if (num < items.size())
-                {
-                    // pick a random element of items and replace it.
-                    items[rnd.get_random_32bit_number()%items.size()] = new_item;
-                }
+                _next_add_accepts = (num < items.size());
            }

-            ++count;
        }

-        void swap (
-            random_subset_selector& a
-        )
-        {
-            a.swap(a.items);
-            std::swap(_max_size, a._max_size);
-            std::swap(count, a.count);
-            rnd.swap(a.rnd);
-        }
-
-    private:
-
        std::vector<T> items;
        unsigned long _max_size;
        uint64 count; 

        rand_type rnd;

+        bool _next_add_accepts;
+
    };

    template <

--- a/dlib/statistics/random_subset_selector_abstract.h
+++ b/dlib/statistics/random_subset_selector_abstract.h
@@ -25,6 +25,7 @@ namespace dlib
            INITIAL VALUE
                - size() == 0
                - max_size() == 0
+                - next_add_accepts() == false

            WHAT THIS OBJECT REPRESENTS
                This object is a tool to help you select a random subset of a large body of data.  
@@ -43,6 +44,24 @@ namespace dlib
                random I mean that each of the 1000000 data samples has an equal chance of ending
                up in the rand_subset object.

+
+                Note that the above example calls get_next_data_sample() for each data sample.  This 
+                may be inefficient since most of the data samples are just ignored.  An alternative 
+                method that doesn't require you to load each sample can also be used.  Consider the 
+                following:
+
+                    random_subset_selector<sample_type> rand_subset;
+                    rand_subset.set_max_size(1000);
+                    for (int i = 0; i < 1000000; ++i)
+                        if (rand_subset.next_add_accepts())
+                            rand_subset.add(get_data_sample(i));
+                        else
+                            rand_subset.add();
+
+                In the above example we only actually fetch the data sample into memory if we
+                know that the rand_subset would include it into the random subset.  Otherwise,
+                we can just call the empty add().
+
        !*/
    public:
        typedef T type;
@@ -118,18 +137,39 @@ namespace dlib
                - returns a const reference to the idx'th element of this object
        !*/

+        bool next_add_accepts (
+        ) const;
+        /*!
+            ensures
+                - if (the next call to add(item) will result in item being included
+                  into *this) then
+                    - returns true
+                    - Note that the next item will always be accepted if size() < max_size().
+                - else
+                    - returns false
+                    - Note that the next item will never be accepted if max_size() == 0.
+        !*/
+
        void add (
            const T& new_item
        );
        /*!
            ensures
-                - if (size() < max_size()) then
-                    - #size() == size() + 1
+                - if (next_add_accepts()) then
                    - places new_item into *this object at a random location
-                - else
-                    - randomly does one of the following:
-                        - ignores new_item and makes no change
-                        - replaces a random element of *this with new_item
+                    - if (size() < max_size()) then
+                        - #size() == size() + 1
+                - #next_add_accepts() == The updated information about the acceptance
+                  of the next call to add()
+        !*/
+
+        void add (
+        );
+        /*!
+            requires
+                - next_add_accepts() == false
+            ensures
+                - This function does nothing but update the value of #next_add_accepts()
        !*/

        iterator begin(

--- a/dlib/test/statistics.cpp
+++ b/dlib/test/statistics.cpp
@@ -60,10 +60,44 @@ namespace
            }
        }

+        void test_random_subset_selector2 ()  // checks that a subset built via next_add_accepts()/add() has a mean close to the full data's mean
+        {
+            random_subset_selector<double> rand_set;
+
+            for (int j = 0; j < 30; ++j)  // repeat the whole experiment 30 times
+            {
+                print_spinner();
+
+                running_stats<double> rs, rs2;  // rs: every value offered; rs2: the values held by rand_set afterwards
+
+                rand_set.set_max_size(1000);
+
+                for (double i = 0; i < 100000; ++i)
+                {
+                    rs.add(i);
+                    if (rand_set.next_add_accepts())
+                        rand_set.add(i);  // this value will be stored, so hand it over
+                    else
+                        rand_set.add();  // this value would be rejected; use the argument-free overload instead
+                }
+
+
+                for (unsigned long i = 0; i < rand_set.size(); ++i)
+                    rs2.add(rand_set[i]);
+
+
+                dlog << LDEBUG << "true mean:    " << rs.mean();
+                dlog << LDEBUG << "true sampled: " << rs2.mean();
+                double ratio = rs.mean()/rs2.mean();
+                DLIB_TEST_MSG(0.96 < ratio  && ratio < 1.04, " ratio: " << ratio);  // the two means must agree to within 4%
+            }
+        }
+
        void perform_test (
        )
        {
            test_random_subset_selector();
+            test_random_subset_selector2();  // also exercise the next_add_accepts() / argument-free add() path
        }
    } a;