Commit 86857190 authored by Davis King's avatar Davis King

Filled out spec for the reinforcement learning stuff.

parent eec89793
......@@ -16,6 +16,7 @@ namespace dlib
>
struct process_sample
{
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
......@@ -61,6 +62,7 @@ namespace dlib
{
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
......
......@@ -3,10 +3,211 @@
#undef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_
#ifdef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_
#include "../matrix.h"
namespace dlib
{
}
// ----------------------------------------------------------------------------------------
struct example_feature_extractor
{
/*!
WHAT THIS OBJECT REPRESENTS
This object defines the interface a feature extractor must implement if it
is to be used with the process_sample and policy objects defined at the
bottom of this file. Moreover, it is meant to represent the core part
of a model used in a reinforcement learning algorithm.
In particular, this object models a Q(state,action) function where
Q(state,action) == dot(w, PSI(state,action))
where PSI(state,action) is a feature vector and w is a parameter
vector.
Therefore, a feature extractor defines how the PSI(state,action) feature vector is
calculated. It also defines the types used to represent the state and
action objects.
THREAD SAFETY
Instances of this object are required to be threadsafe, that is, it should
be safe for multiple threads to make concurrent calls to the member
functions of this object.
!*/
// The state and action can be any types so long as you provide typedefs for them.
typedef T state_type;
typedef U action_type;
// A feature extractor can also require that the last element of the weight vector w be
// exactly 1 by setting this flag to true.  This can be useful for including a prior
// into your model.
const static bool force_last_weight_to_1 = false;
example_feature_extractor(
);
/*!
ensures
- this object is properly initialized.
!*/
unsigned long num_features(
) const;
/*!
ensures
- returns the dimensionality of the PSI() feature vector.
!*/
action_type find_best_action (
const state_type& state,
const matrix<double,0,1>& w
) const;
/*!
ensures
- returns the action A that maximizes Q(state,A) = dot(w,PSI(state,A)).
That is, this function finds the best action to take in the given state
when our model is parameterized by the given weight vector w.
!*/
void get_features (
const state_type& state,
const action_type& action,
matrix<double,0,1>& feats
) const;
/*!
ensures
- #feats.size() == num_features()
- #feats == PSI(state,action)
!*/
};
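// For concreteness, here is a minimal sketch of what an implementation of this
// interface might look like.  It models a hypothetical 4-state "chain" process with
// two actions and uses a one-hot PSI(state,action) encoding, so that
// Q(state,action) == dot(w, PSI(state,action)) simply selects one weight per
// (state,action) pair.  The chain process and every name below are illustrative
// assumptions rather than part of dlib; only the member signatures follow the
// interface above (the sketch also assumes <limits> and dlib/matrix.h are available).
struct chain_feature_extractor
{
    typedef int state_type;    // states are 0, 1, 2, 3
    typedef int action_type;   // actions are 0 (move left) and 1 (move right)

    const static bool force_last_weight_to_1 = false;

    unsigned long num_features (
    ) const { return 4*2; }    // one feature per (state,action) pair

    action_type find_best_action (
        const state_type& state,
        const matrix<double,0,1>& w
    ) const
    {
        // Evaluate Q(state,a) == dot(w, PSI(state,a)) for both actions and keep the best.
        matrix<double,0,1> feats;
        double best_value = -std::numeric_limits<double>::infinity();
        action_type best_action = 0;
        for (action_type a = 0; a < 2; ++a)
        {
            get_features(state, a, feats);
            const double value = dot(w, feats);
            if (value > best_value)
            {
                best_value = value;
                best_action = a;
            }
        }
        return best_action;
    }

    void get_features (
        const state_type& state,
        const action_type& action,
        matrix<double,0,1>& feats
    ) const
    {
        // One-hot encoding: exactly one element of PSI(state,action) is 1.
        feats.set_size(num_features());
        feats = 0;
        feats(state*2 + action) = 1;
    }
};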
// ----------------------------------------------------------------------------------------
template <
typename feature_extractor
>
struct process_sample
{
/*!
REQUIREMENTS ON feature_extractor
feature_extractor should implement the example_feature_extractor interface
defined at the top of this file.
WHAT THIS OBJECT REPRESENTS
This object holds a training sample for a reinforcement learning algorithm.
In particular, it should be a sample from some process where the process
was in state this->state, then took the action this->action, which resulted in
receiving this->reward and ending up in the state this->next_state.
!*/
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
process_sample(){}
process_sample(
const state_type& s,
const action_type& a,
const state_type& n,
const double& r
) : state(s), action(a), next_state(n), reward(r) {}
state_type state;
action_type action;
state_type next_state;
double reward;
};
template < typename feature_extractor >
void serialize (const process_sample<feature_extractor>& item, std::ostream& out);
template < typename feature_extractor >
void deserialize (process_sample<feature_extractor>& item, std::istream& in);
/*!
provides serialization support.
!*/
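// A brief usage sketch: assembling training data from one simulated episode of the
// hypothetical chain process sketched above, assuming <vector> and <algorithm> are
// available.  The reward scheme (1 for reaching state 3, 0 otherwise) is an
// assumption made purely for illustration.
inline std::vector<process_sample<chain_feature_extractor> > make_chain_samples (
)
{
    std::vector<process_sample<chain_feature_extractor> > samples;
    int state = 0;
    for (int step = 0; step < 10; ++step)
    {
        const int action = 1;                              // always move right
        const int next_state = std::min(state+1, 3);
        const double reward = (next_state == 3) ? 1 : 0;
        samples.push_back(process_sample<chain_feature_extractor>(state, action, next_state, reward));
        state = next_state;
    }
    return samples;
}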
// ----------------------------------------------------------------------------------------
template <
typename feature_extractor
>
class policy
{
/*!
REQUIREMENTS ON feature_extractor
feature_extractor should implement the example_feature_extractor interface
defined at the top of this file.
WHAT THIS OBJECT REPRESENTS
This is a policy based on the supplied feature_extractor model. In
particular, it maps from feature_extractor::state_type to the best action
to take in that state.
!*/
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
policy (
);
/*!
ensures
- #get_feature_extractor() == feature_extractor()
(i.e. it will have its default value)
- #get_weights().size() == #get_feature_extractor().num_features()
- #get_weights() == 0
!*/
policy (
const matrix<double,0,1>& weights,
const feature_extractor& fe
);
/*!
requires
- fe.num_features() == weights.size()
ensures
- #get_feature_extractor() == fe
- #get_weights() == weights
!*/
action_type operator() (
const state_type& state
) const;
/*!
ensures
- returns get_feature_extractor().find_best_action(state, get_weights());
!*/
const feature_extractor& get_feature_extractor (
) const;
/*!
ensures
- returns the feature extractor used by this object
!*/
const matrix<double,0,1>& get_weights (
) const;
/*!
ensures
- returns the parameter vector (w) associated with this object. The length
of the vector is get_feature_extractor().num_features().
!*/
};
template < typename feature_extractor >
void serialize(const policy<feature_extractor>& item, std::ostream& out);
template < typename feature_extractor >
void deserialize(policy<feature_extractor>& item, std::istream& in);
/*!
provides serialization support.
!*/
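// A brief usage sketch: picking an action with a policy is just a function call.
// Normally the policy would be produced by lspi::train() (see lspi_abstract.h); a
// default constructed one is used here only to keep the sketch self-contained, and
// chain_feature_extractor is the hypothetical example above.
inline void policy_usage_sketch (
)
{
    policy<chain_feature_extractor> P;  // all weights are 0 by default
    const int best = P(0);              // == P.get_feature_extractor().find_best_action(0, P.get_weights())
    (void)best;                         // the best action to take in state 0 under P
}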
// ----------------------------------------------------------------------------------------
#endif // DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_
......@@ -17,6 +17,7 @@ namespace dlib
class lspi
{
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
......@@ -40,6 +41,12 @@ namespace dlib
double value
)
{
// make sure requires clause is not broken
DLIB_ASSERT(0 < value && value <= 1,
"\t void lspi::set_discount(value)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t value: " << value
);
discount = value;
}
......@@ -99,25 +106,21 @@ namespace dlib
void set_max_iterations (
unsigned long max_iter
) { max_iterations = max_iter; }
/*!
ensures
- #get_max_iterations() == max_iter
!*/
unsigned long get_max_iterations (
) { return max_iterations; }
/*!
ensures
- returns the maximum number of iterations the optimizer is allowed to
run before it is required to stop and return a result.
!*/
template <typename vector_type>
policy<feature_extractor> train (
//const std::vector<process_sample<feature_extractor> >& samples
const vector_type& samples
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(samples.size() > 0,
"\t policy lspi::train(samples)"
<< "\n\t invalid inputs were given to this function"
);
matrix<double,0,1> w(fe.num_features());
w = 0;
matrix<double,0,1> prev_w, b, f1, f2;
......
......@@ -3,9 +3,189 @@
#undef DLIB_LSPI_ABSTRACT_Hh_
#ifdef DLIB_LSPI_ABSTRACT_Hh_
#include "approximate_linear_models_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename feature_extractor
>
class lspi
{
/*!
REQUIREMENTS ON feature_extractor
feature_extractor should implement the example_feature_extractor interface
defined at the top of dlib/control/approximate_linear_models_abstract.h
WHAT THIS OBJECT REPRESENTS
This object is an implementation of the reinforcement learning algorithm
described in the following paper:
Lagoudakis, Michail G., and Ronald Parr. "Least-squares policy
iteration." The Journal of Machine Learning Research 4 (2003):
1107-1149.
This means that it takes a bunch of training data in the form of
process_samples and outputs a policy that hopefully performs well when run
on the process that generated those samples.
!*/
public:
typedef feature_extractor feature_extractor_type;
typedef typename feature_extractor::state_type state_type;
typedef typename feature_extractor::action_type action_type;
explicit lspi(
const feature_extractor& fe_
);
/*!
ensures
- #get_feature_extractor() == fe_
- #get_lambda() == 0.01
- #get_discount() == 0.8
- #get_epsilon() == 0.01
- is not verbose
- #get_max_iterations() == 100
!*/
lspi(
);
/*!
ensures
- #get_feature_extractor() == feature_extractor()
(i.e. it will have its default value)
- #get_lambda() == 0.01
- #get_discount() == 0.8
- #get_epsilon() == 0.01
- is not verbose
- #get_max_iterations() == 100
!*/
double get_discount (
) const;
/*!
ensures
- returns the discount applied to the sum of rewards in the Bellman
equation.
!*/
void set_discount (
double value
);
/*!
requires
- 0 < value <= 1
ensures
- #get_discount() == value
!*/
const feature_extractor& get_feature_extractor (
) const;
/*!
ensures
- returns the feature extractor used by this object
!*/
void be_verbose (
);
/*!
ensures
- This object will print status messages to standard out so that a
user can observe the progress of the algorithm.
!*/
void be_quiet (
);
/*!
ensures
- this object will not print anything to standard out
!*/
void set_epsilon (
double eps
);
/*!
requires
- eps > 0
ensures
- #get_epsilon() == eps
!*/
double get_epsilon (
) const;
/*!
ensures
- returns the error epsilon that determines when training should stop.
Smaller values may result in a more accurate solution but take longer to
train.
!*/
void set_lambda (
double lambda_
);
/*!
requires
- lambda_ >= 0
ensures
- #get_lambda() == lambda_
!*/
double get_lambda (
) const;
/*!
ensures
- returns the regularization parameter. It is the parameter that
determines the trade off between trying to fit the training data
exactly or allowing more errors but hopefully improving the
generalization ability of the resulting function. Smaller values
encourage exact fitting while larger values of lambda may encourage
better generalization.
!*/
void set_max_iterations (
unsigned long max_iter
);
/*!
ensures
- #get_max_iterations() == max_iter
!*/
unsigned long get_max_iterations (
);
/*!
ensures
- returns the maximum number of iterations the optimizer is allowed to
run before it is required to stop and return a result.
!*/
template <
typename vector_type
>
policy<feature_extractor> train (
const vector_type& samples
) const;
/*!
requires
- samples.size() > 0
- samples is something with an interface that looks like
std::vector<process_sample<feature_extractor>>. That is, it should
be some kind of array of process_sample objects.
ensures
- Trains a policy based on the given data and returns the results. The
idea is to find a policy that will obtain the largest possible reward
when run on the process that generated the samples. In particular,
if the returned policy is P then:
- P(S) == the best action to take when in state S.
- if (feature_extractor::force_last_weight_to_1) then
- The last element of P.get_weights() is 1.
!*/
};
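// A brief end-to-end sketch, assuming the hypothetical chain_feature_extractor and
// make_chain_samples() sketched in approximate_linear_models_abstract.h.  The
// parameter values below are illustrative, not recommendations.
inline policy<chain_feature_extractor> train_chain_policy (
)
{
    const std::vector<process_sample<chain_feature_extractor> > samples = make_chain_samples();

    lspi<chain_feature_extractor> trainer;
    trainer.set_lambda(0.01);
    trainer.set_discount(0.9);
    trainer.set_epsilon(0.01);
    trainer.set_max_iterations(100);

    // The returned policy maps each state to the learned best action.
    return trainer.train(samples);
}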
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_LSPI_ABSTRACT_Hh_
......