Commit 86857190 authored by Davis King's avatar Davis King

Filled out spec for the reinforcement learning stuff.

parent eec89793
......@@ -16,6 +16,7 @@ namespace dlib
>
struct process_sample
{
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
......@@ -61,6 +62,7 @@ namespace dlib
{
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
......
......@@ -3,10 +3,211 @@
#undef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_
#ifdef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_
#include "../matrix.h"
namespace dlib
{
}
// ----------------------------------------------------------------------------------------
struct example_feature_extractor
{
/*!
WHAT THIS OBJECT REPRESENTS
This object defines the interface a feature extractor must implement if it
is to be used with the process_sample and policy objects defined at the
bottom of this file. Moreover, it is meant to represent the core part
of a model used in a reinforcement learning algorithm.
In particular, this object models a Q(state,action) function where
Q(state,action) == dot(w, PSI(state,action))
where PSI(state,action) is a feature vector and w is a parameter
vector.
Therefore, a feature extractor defines how the PSI(state,action) feature vector is
calculated. It also defines the types used to represent the state and
action objects.
THREAD SAFETY
Instances of this object are required to be threadsafe, that is, it should
be safe for multiple threads to make concurrent calls to the member
functions of this object.
!*/
// The state and action can be any types so long as you provide typedefs for them.
typedef T state_type;
typedef U action_type;
// A feature extractor can also require that the last element of the weight vector w be
// exactly 1 by setting this flag to true.  This can be useful for including a prior
// into your model.
const static bool force_last_weight_to_1 = false;
example_feature_extractor(
);
/*!
ensures
- this object is properly initialized.
!*/
unsigned long num_features(
) const;
/*!
ensures
- returns the dimensionality of the PSI() feature vector.
!*/
action_type find_best_action (
const state_type& state,
const matrix<double,0,1>& w
) const;
/*!
ensures
- returns the action A that maximizes Q(state,A) = dot(w,PSI(state,A)).
That is, this function finds the best action to take in the given state
when our model is parameterized by the given weight vector w.
!*/
void get_features (
const state_type& state,
const action_type& action,
matrix<double,0,1>& feats
) const;
/*!
ensures
- #feats.size() == num_features()
- #feats == PSI(state,action)
!*/
};
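// For concreteness, here is a minimal sketch of what an implementation of this
// interface might look like.  It models a hypothetical 4-state "chain" process with
// two actions and uses a one-hot PSI(state,action) encoding, so that
// Q(state,action) == dot(w, PSI(state,action)) simply selects one weight per
// (state,action) pair.  The chain process and every name below are illustrative
// assumptions rather than part of dlib; only the member signatures follow the
// interface above (the sketch also assumes <limits> and dlib/matrix.h are available).
struct chain_feature_extractor
{
    typedef int state_type;    // states are 0, 1, 2, 3
    typedef int action_type;   // actions are 0 (move left) and 1 (move right)

    const static bool force_last_weight_to_1 = false;

    unsigned long num_features (
    ) const { return 4*2; }    // one feature per (state,action) pair

    action_type find_best_action (
        const state_type& state,
        const matrix<double,0,1>& w
    ) const
    {
        // Evaluate Q(state,a) == dot(w, PSI(state,a)) for both actions and keep the best.
        matrix<double,0,1> feats;
        double best_value = -std::numeric_limits<double>::infinity();
        action_type best_action = 0;
        for (action_type a = 0; a < 2; ++a)
        {
            get_features(state, a, feats);
            const double value = dot(w, feats);
            if (value > best_value)
            {
                best_value = value;
                best_action = a;
            }
        }
        return best_action;
    }

    void get_features (
        const state_type& state,
        const action_type& action,
        matrix<double,0,1>& feats
    ) const
    {
        // One-hot encoding: exactly one element of PSI(state,action) is 1.
        feats.set_size(num_features());
        feats = 0;
        feats(state*2 + action) = 1;
    }
};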
// ----------------------------------------------------------------------------------------
template <
typename feature_extractor
>
struct process_sample
{
/*!
REQUIREMENTS ON feature_extractor
feature_extractor should implement the example_feature_extractor interface
defined at the top of this file.
WHAT THIS OBJECT REPRESENTS
This object holds a training sample for a reinforcement learning algorithm.
In particular, it should be a sample from some process where the process
was in state this->state, then took the action this->action, which resulted in
receiving this->reward and ending up in the state this->next_state.
!*/
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
process_sample(){}
process_sample(
const state_type& s,
const action_type& a,
const state_type& n,
const double& r
) : state(s), action(a), next_state(n), reward(r) {}
state_type state;
action_type action;
state_type next_state;
double reward;
};
template < typename feature_extractor >
void serialize (const process_sample<feature_extractor>& item, std::ostream& out);
template < typename feature_extractor >
void deserialize (process_sample<feature_extractor>& item, std::istream& in);
/*!
provides serialization support.
!*/
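// A brief usage sketch: assembling training data from one simulated episode of the
// hypothetical chain process sketched above, assuming <vector> and <algorithm> are
// available.  The reward scheme (1 for reaching state 3, 0 otherwise) is an
// assumption made purely for illustration.
inline std::vector<process_sample<chain_feature_extractor> > make_chain_samples (
)
{
    std::vector<process_sample<chain_feature_extractor> > samples;
    int state = 0;
    for (int step = 0; step < 10; ++step)
    {
        const int action = 1;                              // always move right
        const int next_state = std::min(state+1, 3);
        const double reward = (next_state == 3) ? 1 : 0;
        samples.push_back(process_sample<chain_feature_extractor>(state, action, next_state, reward));
        state = next_state;
    }
    return samples;
}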
// ----------------------------------------------------------------------------------------
template <
typename feature_extractor
>
class policy
{
/*!
REQUIREMENTS ON feature_extractor
feature_extractor should implement the example_feature_extractor interface
defined at the top of this file.
WHAT THIS OBJECT REPRESENTS
This is a policy based on the supplied feature_extractor model. In
particular, it maps from feature_extractor::state_type to the best action
to take in that state.
!*/
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
policy (
);
/*!
ensures
- #get_feature_extractor() == feature_extractor()
(i.e. it will have its default value)
- #get_weights().size() == #get_feature_extractor().num_features()
- #get_weights() == 0
!*/
policy (
const matrix<double,0,1>& weights,
const feature_extractor& fe
);
/*!
requires
- fe.num_features() == weights.size()
ensures
- #get_feature_extractor() == fe
- #get_weights() == weights
!*/
action_type operator() (
const state_type& state
) const;
/*!
ensures
- returns get_feature_extractor().find_best_action(state, get_weights());
!*/
const feature_extractor& get_feature_extractor (
) const;
/*!
ensures
- returns the feature extractor used by this object
!*/
const matrix<double,0,1>& get_weights (
) const;
/*!
ensures
- returns the parameter vector (w) associated with this object. The length
of the vector is get_feature_extractor().num_features().
!*/
};
template < typename feature_extractor >
void serialize(const policy<feature_extractor>& item, std::ostream& out);
template < typename feature_extractor >
void deserialize(policy<feature_extractor>& item, std::istream& in);
/*!
provides serialization support.
!*/
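// A brief usage sketch: picking an action with a policy is just a function call.
// Normally the policy would be produced by lspi::train() (see lspi_abstract.h); a
// default constructed one is used here only to keep the sketch self-contained, and
// chain_feature_extractor is the hypothetical example above.
inline void policy_usage_sketch (
)
{
    policy<chain_feature_extractor> P;  // all weights are 0 by default
    const int best = P(0);              // == P.get_feature_extractor().find_best_action(0, P.get_weights())
    (void)best;                         // the best action to take in state 0 under P
}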
// ----------------------------------------------------------------------------------------
#endif // DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_
......@@ -17,6 +17,7 @@ namespace dlib
class lspi
{
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
......@@ -40,6 +41,12 @@ namespace dlib
double value
)
{
// make sure requires clause is not broken
DLIB_ASSERT(0 < value && value <= 1,
"\t void lspi::set_discount(value)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t value: " << value
);
discount = value;
}
......@@ -99,25 +106,21 @@ namespace dlib
void set_max_iterations (
unsigned long max_iter
) { max_iterations = max_iter; }
/*!
ensures
- #get_max_iterations() == max_iter
!*/
unsigned long get_max_iterations (
) { return max_iterations; }
/*!
ensures
- returns the maximum number of iterations the optimizer is allowed to
run before it is required to stop and return a result.
!*/
template <typename vector_type>
policy<feature_extractor> train (
//const std::vector<process_sample<feature_extractor> >& samples
const vector_type& samples
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(samples.size() > 0,
"\t policy lspi::train(samples)"
<< "\n\t invalid inputs were given to this function"
);
matrix<double,0,1> w(fe.num_features());
w = 0;
matrix<double,0,1> prev_w, b, f1, f2;
......
......@@ -3,9 +3,189 @@
#undef DLIB_LSPI_ABSTRACT_Hh_
#ifdef DLIB_LSPI_ABSTRACT_Hh_
#include "approximate_linear_models_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename feature_extractor
>
class lspi
{
/*!
REQUIREMENTS ON feature_extractor
feature_extractor should implement the example_feature_extractor interface
defined at the top of dlib/control/approximate_linear_models_abstract.h
WHAT THIS OBJECT REPRESENTS
This object is an implementation of the reinforcement learning algorithm
described in the following paper:
Lagoudakis, Michail G., and Ronald Parr. "Least-squares policy
iteration." The Journal of Machine Learning Research 4 (2003):
1107-1149.
This means that it takes a bunch of training data in the form of
process_samples and outputs a policy that hopefully performs well when run
on the process that generated those samples.
!*/
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
explicit lspi(
const feature_extractor& fe_
);
/*!
ensures
- #get_feature_extractor() == fe_
- #get_lambda() == 0.01
- #get_discount() == 0.8
- #get_epsilon() == 0.01
- is not verbose
- #get_max_iterations() == 100
!*/
lspi(
);
/*!
ensures
- #get_feature_extractor() == feature_extractor()
(i.e. it will have its default value)
- #get_lambda() == 0.01
- #get_discount() == 0.8
- #get_epsilon() == 0.01
- is not verbose
- #get_max_iterations() == 100
!*/
double get_discount (
) const;
/*!
ensures
- returns the discount applied to the sum of rewards in the Bellman
equation.
!*/
void set_discount (
double value
);
/*!
requires
- 0 < value <= 1
ensures
- #get_discount() == value
!*/
const feature_extractor& get_feature_extractor (
) const;
/*!
ensures
- returns the feature extractor used by this object
!*/
void be_verbose (
);
/*!
ensures
- This object will print status messages to standard out so that a
user can observe the progress of the algorithm.
!*/
void be_quiet (
);
/*!
ensures
- this object will not print anything to standard out
!*/
void set_epsilon (
double eps
);
/*!
requires
- eps > 0
ensures
- #get_epsilon() == eps
!*/
double get_epsilon (
) const;
/*!
ensures
- returns the error epsilon that determines when training should stop.
Smaller values may result in a more accurate solution but take longer to
train.
!*/
void set_lambda (
double lambda_
);
/*!
requires
- lambda_ >= 0
ensures
- #get_lambda() == lambda_
!*/
double get_lambda (
) const;
/*!
ensures
- returns the regularization parameter. It is the parameter that
determines the trade off between trying to fit the training data
exactly or allowing more errors but hopefully improving the
generalization ability of the resulting function. Smaller values
encourage exact fitting while larger values of lambda may encourage
better generalization.
!*/
void set_max_iterations (
unsigned long max_iter
);
/*!
ensures
- #get_max_iterations() == max_iter
!*/
unsigned long get_max_iterations (
);
/*!
ensures
- returns the maximum number of iterations the optimizer is allowed to
run before it is required to stop and return a result.
!*/
template <
typename vector_type
>
policy<feature_extractor> train (
const vector_type& samples
) const;
/*!
requires
- samples.size() > 0
- samples is something with an interface that looks like
std::vector<process_sample<feature_extractor>>. That is, it should
be some kind of array of process_sample objects.
ensures
- Trains a policy based on the given data and returns the results. The
idea is to find a policy that will obtain the largest possible reward
when run on the process that generated the samples. In particular,
if the returned policy is P then:
- P(S) == the best action to take when in state S.
- if (feature_extractor::force_last_weight_to_1) then
- The last element of P.get_weights() is 1.
!*/
};
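// A brief end-to-end sketch, assuming the hypothetical chain_feature_extractor and
// make_chain_samples() sketched in approximate_linear_models_abstract.h.  The
// parameter values below are illustrative, not recommendations.
inline policy<chain_feature_extractor> train_chain_policy (
)
{
    const std::vector<process_sample<chain_feature_extractor> > samples = make_chain_samples();

    lspi<chain_feature_extractor> trainer;
    trainer.set_lambda(0.01);
    trainer.set_discount(0.9);
    trainer.set_epsilon(0.01);
    trainer.set_max_iterations(100);

    // The returned policy maps each state to the learned best action.
    return trainer.train(samples);
}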
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_LSPI_ABSTRACT_Hh_
......