diff --git a/dlib/dnn.h b/dlib/dnn.h new file mode 100644 index 0000000000000000000000000000000000000000..19204367e46d03f03bccb0b4dd024c72c766ee96 --- /dev/null +++ b/dlib/dnn.h @@ -0,0 +1,15 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_ +#define DLIB_DNn_ + +#include "dnn/tensor.h" +#include "dnn/input.h" +#include "dnn/layers.h" +#include "dnn/loss.h" +#include "dnn/core.h" +#include "dnn/solvers.h" + +#endif // DLIB_DNn_ + + diff --git a/dlib/dnn/core.h b/dlib/dnn/core.h new file mode 100644 index 0000000000000000000000000000000000000000..007042d03c0fffc01319d27d355c3c959fceec3d --- /dev/null +++ b/dlib/dnn/core.h @@ -0,0 +1,1405 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_CORE_H_ +#define DLIB_DNn_CORE_H_ + +#include "core_abstract.h" +#include "tensor.h" +#include "solvers.h" +#include <iterator> +#include <memory> +#include <type_traits> +#include <dlib/statistics.h> +#include <dlib/rand.h> +#include <utility> + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + // Tell us if T is one of the special layer types (i.e. add_layer, add_loss, add_tag, + // or add_skip). + template <typename T> struct is_layer_type : std::false_type {}; + template <typename T> struct is_loss_layer_type : std::false_type {}; + +// ---------------------------------------------------------------------------------------- + + inline void randomize_parameters ( + tensor& params, + unsigned long num_inputs_and_outputs, + dlib::rand& rnd + ) + { + float* data = params.host(); + for (size_t i = 0; i < params.size(); ++i) + { + // Draw a random number to initialize the layer according to formula (16) + // from Understanding the difficulty of training deep feedforward neural + // networks by Xavier Glorot and Yoshua Bengio. + float val = 2*rnd.get_random_float()-1; + val *= std::sqrt(6.0/(num_inputs_and_outputs)); + + data[i] = val; + } + } + +// ---------------------------------------------------------------------------------------- + + template <typename T, size_t N> + class sstack + { + public: + static_assert(N > 0, "You can't create an empty sstack."); + typedef T value_type; + const static size_t num_elements = N; + + sstack() {} + sstack(const T& item_) : item(item_), data(item_) {} + + const T& top() const { return item; } + T& top() { return item; } + + size_t size() const { return N; } + + const sstack<T,N-1>& pop() const { return data; } + sstack<T,N-1>& pop() { return data; } + + private: + T item; + sstack<T,N-1> data; + }; + + template <typename T> + class sstack<T,1> // base case of recursive definition. + { + public: + sstack() {} + explicit sstack(const T& item_) : item(item_) {} + + const T& top() const { return item; } + T& top() { return item; } + + size_t size() const { return 1; } + private: + T item; + }; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace dimpl + { + template <typename T, typename enabled=void> + class sub_net_wrapper + { + /*! 
+                WHAT THIS OBJECT REPRESENTS
+                    This is a tool that makes an add_layer or add_loss object expose
+                    only the part of its interface defined by the SUB_NET type in
+                    layers_abstract.h.  This way, when we pass sub network objects to
+                    the layer callbacks, those callbacks won't be able to interact
+                    with the sub networks in any way other than what is specified by
+                    the SUB_NET interface spec.
+            !*/
+
+        public:
+            sub_net_wrapper(T& l_) {}
+            // Nothing here because in this case T is one of the input layer types
+            // that doesn't have anything in it.
+        };
+
+        template <typename T>
+        class sub_net_wrapper<T,typename std::enable_if<is_layer_type<T>::value>::type>
+        {
+
+        public:
+            typedef T wrapped_type;
+            const static size_t num_layers = T::num_layers;
+
+            sub_net_wrapper(T& l_) : l(l_),sub(l.sub_net()) {}
+
+            const tensor& get_output() const { return l.get_output(); }
+            tensor& get_gradient_input() { return l.get_gradient_input(); }
+
+            const sub_net_wrapper<typename T::sub_net_type>& sub_net() const { return sub; }
+            sub_net_wrapper<typename T::sub_net_type>& sub_net() { return sub; }
+
+        private:
+            T& l;
+            sub_net_wrapper<typename T::sub_net_type> sub;
+        };
+    }
+
+    template <typename LAYER_DETAILS, typename SUB_NET, typename enabled = void>
+    class add_layer;
+
+    template <typename T, typename U>
+    struct is_layer_type<add_layer<T,U>> : std::true_type {};
+
+    template <typename LAYER_DETAILS, typename SUB_NET>
+    class add_layer<LAYER_DETAILS,SUB_NET,
+            typename std::enable_if<is_layer_type<SUB_NET>::value>::type>
+    {
+    public:
+        typedef LAYER_DETAILS layer_details_type;
+        typedef SUB_NET sub_net_type;
+        typedef typename sub_net_type::input_type input_type;
+        const static size_t num_layers = sub_net_type::num_layers + 1;
+        const static unsigned int sample_expansion_factor = sub_net_type::sample_expansion_factor;
+
+        add_layer(
+        ):
+            this_layer_setup_called(false),
+            gradient_input_is_stale(true)
+        {
+        }
+
+        add_layer(const add_layer&) = default;
+        add_layer(add_layer&&) = default;
+        add_layer& operator=(add_layer&&) = default;
+        add_layer& operator=(const add_layer&) = default;
+
+        template <typename T, typename U, typename E>
+        friend class add_layer;
+
+        // Allow copying networks from one to another as long as their corresponding
+        // layers can be constructed from each other.
+        template <typename T, typename U, typename E>
+        add_layer(
+            const add_layer<T,U,E>& item
+        ) :
+            sub_network(item.sub_net()),
+            details(item.layer_details()),
+            this_layer_setup_called(item.this_layer_setup_called),
+            gradient_input_is_stale(item.gradient_input_is_stale),
+            x_grad(item.x_grad),
+            cached_output(item.cached_output)
+        {
+        }
+
+        template <typename ...T>
+        add_layer(
+            const LAYER_DETAILS& layer_det,
+            T&& ...args
+        ) :
+            // Initializers are listed in member declaration order (sub_network is
+            // declared before details) since that is the order C++ runs them in.
+            sub_network(std::forward<T>(args)...),
+            details(layer_det),
+            this_layer_setup_called(false),
+            gradient_input_is_stale(true)
+        {
+        }
+
+        template <typename ...T>
+        add_layer(
+            LAYER_DETAILS&& layer_det,
+            T&& ...args
+        ) :
+            sub_network(std::forward<T>(args)...),
+            details(std::move(layer_det)),
+            this_layer_setup_called(false),
+            gradient_input_is_stale(true)
+        {
+        }
+
+        template <typename input_iterator>
+        void to_tensor (
+            input_iterator begin,
+            input_iterator end,
+            resizable_tensor& data
+        ) const
+        {
+            sub_network.to_tensor(begin,end,data);
+        }
+
+        template <typename input_iterator>
+        const tensor& operator() (
+            input_iterator ibegin,
+            input_iterator iend
+        )
+        /*!
+            ensures
+                - runs [ibegin,iend) through the network and returns the results.
+        !*/
+        {
+            to_tensor(ibegin,iend,temp_tensor);
+            return forward(temp_tensor);
+        }
+
+
+        const tensor& operator() (const input_type& x)
+        /*!
+            ensures
+                - runs a single x through the network and returns the output.
+        !*/
+        {
+            return (*this)(&x, &x+1);
+        }
+
+        const tensor& forward(const tensor& x)
+        {
+            sub_network.forward(x);
+            const dimpl::sub_net_wrapper<sub_net_type> wsub(sub_network);
+            if (!this_layer_setup_called)
+            {
+                details.setup(wsub);
+                this_layer_setup_called = true;
+            }
+            details.forward(wsub, cached_output);
+            gradient_input_is_stale = true;
+            return get_output();
+        }
+
+        const tensor& get_output() const { return cached_output; }
+        tensor& get_gradient_input()
+        {
+            if (gradient_input_is_stale)
+            {
+                gradient_input_is_stale = false;
+                x_grad.copy_size(get_output());
+                x_grad = 0;
+            }
+            return x_grad;
+        }
+
+        template <typename solver_type>
+        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
+        /*!
+            requires
+                - forward(x) was called to forward propagate x through the network.
+                - x.num_samples() == get_gradient_input().num_samples()
+                - get_gradient_input() == the gradient of the network with respect
+                  to some loss.
+        !*/
+        {
+            dimpl::sub_net_wrapper<sub_net_type> wsub(sub_network);
+            params_grad.copy_size(details.get_layer_params());
+            params_grad = 0;
+            details.backward(get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
+            // Don't try to adjust the parameters if this layer doesn't have any.
+            if (params_grad.size() != 0)
+                solvers.top()(details, static_cast<const tensor&>(params_grad));
+            sub_network.update(x, solvers.pop());
+        }
+
+        const sub_net_type& sub_net() const { return sub_network; }
+        sub_net_type& sub_net() { return sub_network; }
+
+        const layer_details_type& layer_details() const { return details; }
+        layer_details_type& layer_details() { return details; }
+
+        void clean()
+        {
+            x_grad.clear();
+            cached_output.clear();
+            params_grad.clear();
+            temp_tensor.clear();
+            gradient_input_is_stale = true;
+            sub_network.clean();
+        }
+
+    private:
+
+
+        sub_net_type sub_network;
+        LAYER_DETAILS details;
+        bool this_layer_setup_called;
+        bool gradient_input_is_stale;
+        resizable_tensor x_grad;
+        resizable_tensor cached_output;
+
+        // The following 2 objects don't logically contribute to the state of this class.
+        // They are only here to prevent them from being reallocated over and over in
+        // member functions.
+ resizable_tensor params_grad; + resizable_tensor temp_tensor; + + }; + +// ---------------------------------------------------------------------------------------- + + template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled> + class add_layer + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef INPUT_LAYER sub_net_type; + typedef typename INPUT_LAYER::input_type input_type; + const static unsigned int sample_expansion_factor = INPUT_LAYER::sample_expansion_factor; + const static size_t num_layers = 1; + static_assert(sample_expansion_factor >= 1, + "The input layer can't produce fewer output tensors than there are inputs."); + + add_layer( + ): + this_layer_setup_called(false), + gradient_input_is_stale(true) + {} + + add_layer(const add_layer&) = default; + add_layer(add_layer&&) = default; + add_layer& operator=(add_layer&&) = default; + add_layer& operator=(const add_layer&) = default; + + template <typename T, typename U, typename E> + friend class add_layer; + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. + template <typename T, typename U, typename E> + add_layer( + const add_layer<T,U,E>& item + ): + input_layer(item.sub_net()), + details(item.layer_details()), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + x_grad(item.x_grad), + cached_output(item.cached_output) + { + } + + add_layer( + const LAYER_DETAILS& layer_det + ) : + details(layer_det), + this_layer_setup_called(false), + gradient_input_is_stale(true) + {} + + add_layer( + LAYER_DETAILS&& layer_det + ) : + details(std::move(layer_det)), + this_layer_setup_called(false), + gradient_input_is_stale(true) + {} + + add_layer( + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : + details(layer_det), + input_layer(il), + this_layer_setup_called(false), + gradient_input_is_stale(true) + {} + + template <typename input_iterator> + void to_tensor ( + input_iterator begin, + input_iterator end, + resizable_tensor& data + ) const + { + input_layer.to_tensor(begin, end, data); + // make sure the input layer's to_tensor() function is implemented properly. + DLIB_CASSERT(std::distance(begin,end)*sample_expansion_factor == data.num_samples(),""); + data.async_copy_to_device(); + } + + + template <typename input_iterator> + const tensor& operator() ( + input_iterator ibegin, + input_iterator iend + ) + /*! + ensures + - runs [ibegin,iend) through the network and returns the results + !*/ + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + /*! + ensures + - runs a single x through the network and returns the output. + !*/ + { + return (*this)(&x, &x+1); + } + + const tensor& forward (const tensor& x) + /*! + requires + - x.num_samples() is a multiple of sample_expansion_factor. 
+        !*/
+        {
+            DLIB_CASSERT(x.num_samples()%sample_expansion_factor == 0,"");
+            sub_net_wrapper wsub(x, grad_final_ignored);
+            if (!this_layer_setup_called)
+            {
+                details.setup(wsub);
+                this_layer_setup_called = true;
+            }
+            details.forward(wsub, cached_output);
+            gradient_input_is_stale = true;
+            return get_output();
+        }
+
+        const tensor& get_output() const { return cached_output; }
+        tensor& get_gradient_input()
+        {
+            if (gradient_input_is_stale)
+            {
+                gradient_input_is_stale = false;
+                x_grad.copy_size(get_output());
+                x_grad = 0;
+            }
+            return x_grad;
+        }
+
+
+        template <typename solver_type>
+        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
+        /*!
+            requires
+                - x.num_samples() is a multiple of sample_expansion_factor.
+                - forward(x) was called to forward propagate x through the network.
+                - x.num_samples() == get_gradient_input().num_samples()
+        !*/
+        {
+            sub_net_wrapper wsub(x, grad_final_ignored);
+            params_grad.copy_size(details.get_layer_params());
+            params_grad = 0;
+            details.backward(get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
+            // Don't try to adjust the parameters if this layer doesn't have any.
+            if (params_grad.size() != 0)
+                solvers.top()(details, static_cast<const tensor&>(params_grad));
+        }
+
+        const sub_net_type& sub_net() const { return input_layer; }
+        sub_net_type& sub_net() { return input_layer; }
+
+        const layer_details_type& layer_details() const { return details; }
+        layer_details_type& layer_details() { return details; }
+
+        void clean()
+        {
+            x_grad.clear();
+            grad_final_ignored.clear();
+            cached_output.clear();
+            params_grad.clear();
+            temp_tensor.clear();
+            gradient_input_is_stale = true;
+        }
+
+    private:
+
+        class sub_net_wrapper
+        {
+        public:
+            sub_net_wrapper(const tensor& x_, resizable_tensor& grad_final_ignored_) :
+                x(x_), grad_final_ignored(grad_final_ignored_) {}
+
+            const tensor& get_output() const { return x; }
+            tensor& get_gradient_input()
+            {
+                // It doesn't matter what values are in this tensor, but client code
+                // will always assume it has the same dimensions as the output, so
+                // make sure that is the case.
+                grad_final_ignored.copy_size(x);
+                return grad_final_ignored;
+            }
+
+        private:
+            const tensor& x;
+            resizable_tensor& grad_final_ignored;
+        };
+
+        sub_net_type input_layer;
+        LAYER_DETAILS details;
+        bool this_layer_setup_called;
+        bool gradient_input_is_stale;
+        resizable_tensor x_grad;
+        resizable_tensor cached_output;
+
+        // The following 3 objects don't logically contribute to the state of this class.
+        // They are only here to prevent them from being reallocated over and over in
+        // member functions.
+        resizable_tensor params_grad;
+        resizable_tensor temp_tensor;
+        resizable_tensor grad_final_ignored;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <unsigned long ID, typename SUB_NET>
+    class add_tag
+    {
+    public:
+        typedef SUB_NET sub_net_type;
+        typedef typename sub_net_type::input_type input_type;
+        const static size_t num_layers = sub_net_type::num_layers + 1;
+        const static unsigned int sample_expansion_factor = sub_net_type::sample_expansion_factor;
+        static_assert(sample_expansion_factor >= 1,
+            "The input layer can't produce fewer output tensors than there are inputs.");
+
+        add_tag() = default;
+        add_tag(const add_tag&) = default;
+        add_tag(add_tag&&) = default;
+        add_tag& operator=(add_tag&&) = default;
+        add_tag& operator=(const add_tag&) = default;
+
+        template <typename T>
+        add_tag(
+            const add_tag<ID,T>& item
+        ) : sub_network(item.sub_net())
+        {}
+
+        template <typename ...T>
+        add_tag(
+            T ...args
+        ) :
+            sub_network(std::move(args)...)
+        {
+        }
+
+        template <typename input_iterator>
+        void to_tensor (
+            input_iterator begin,
+            input_iterator end,
+            resizable_tensor& data
+        ) const
+        {
+            sub_network.to_tensor(begin,end,data);
+        }
+
+        template <typename input_iterator>
+        const tensor& operator() (
+            input_iterator ibegin,
+            input_iterator iend
+        )
+        {
+            return sub_network(ibegin,iend);
+        }
+
+        const tensor& operator() (const input_type& x)
+        {
+            return sub_network(x);
+        }
+
+        const tensor& forward(const tensor& x)
+        {
+            return sub_network.forward(x);
+        }
+
+        const tensor& get_output() const { return sub_network.get_output(); }
+
+        tensor& get_gradient_input()
+        {
+            return sub_network.get_gradient_input();
+        }
+
+        template <typename solver_type>
+        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
+        {
+            sub_network.update(x,solvers.pop());
+        }
+
+        const sub_net_type& sub_net() const { return sub_network; }
+        sub_net_type& sub_net() { return sub_network; }
+
+        void clean()
+        {
+            sub_network.clean();
+        }
+
+    private:
+
+        sub_net_type sub_network;
+    };
+
+    template <unsigned long ID, typename U>
+    struct is_layer_type<add_tag<ID,U>> : std::true_type {};
+
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <typename LOSS_DETAILS, typename SUB_NET>
+    class add_loss;
+
+    class no_label_type
+    {
+    private:
+        // We don't want anyone making these no_label_type objects.  They are here only to
+        // allow add_loss::label_type and dnn_trainer::label_type to exist, which avoids
+        // needing to overload add_loss and dnn_trainer for supervised and unsupervised
+        // losses.  It can also be used in template metaprogramming to indicate
+        // "no label".  So here we make the constructor private, with the exception that
+        // add_loss objects can make it (again, just to simplify add_loss's
+        // implementation).
+ no_label_type()=default; + template <typename LOSS_DETAILS, typename SUB_NET> friend class add_loss; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename LOSS_DETAILS, typename SUB_NET> + class add_loss + { + template <typename T, typename enabled=void> + struct get_loss_layer_label_type + { + typedef no_label_type type; + }; + template <typename T> + struct get_loss_layer_label_type<T,typename std::enable_if<sizeof(typename T::label_type)!=0>::type> + { + typedef typename T::label_type type; + }; + + public: + typedef LOSS_DETAILS loss_details_type; + typedef SUB_NET sub_net_type; + typedef typename sub_net_type::input_type input_type; + // Note that the loss layer doesn't count as an additional layer. + const static size_t num_layers = sub_net_type::num_layers; + const static unsigned int sample_expansion_factor = sub_net_type::sample_expansion_factor; + typedef typename get_loss_layer_label_type<LOSS_DETAILS>::type label_type; + + static_assert(is_layer_type<SUB_NET>::value, "SUB_NET must be of type add_layer, add_skip, or add_tag."); + static_assert(sample_expansion_factor == LOSS_DETAILS::sample_expansion_factor, + "The loss layer and input layer must agree on the sample_expansion_factor."); + + + add_loss() = default; + add_loss(const add_loss&) = default; + add_loss(add_loss&&) = default; + add_loss& operator=(add_loss&&) = default; + add_loss& operator=(const add_loss&) = default; + + template <typename T, typename U> + add_loss( + const add_loss<T,U>& item + ) : + loss(item.loss_details()), + sub(item.sub_net()) + {} + + template <typename ...T> + add_loss( + const LOSS_DETAILS& layer_det, + T&& ...args + ) : + loss(layer_det), + sub(std::forward<T>(args)...) + { + } + + template <typename ...T> + add_loss( + LOSS_DETAILS&& layer_det, + T&& ...args + ) : + loss(std::move(layer_det)), + sub(std::forward<T>(args)...) + { + } + + template <typename ...T> + add_loss( + T ...args + ) : + sub(std::move(args)...) + { + } + + template <typename input_iterator, typename output_iterator> + void operator() ( + input_iterator ibegin, + input_iterator iend, + output_iterator obegin + ) + /*! + requires + - obegin == iterator pointing to the start of a range of distance(ibegin,iend) + elements. + ensures + - runs [ibegin,iend) through the network and writes the output to the range at obegin. + !*/ + { + sub.to_tensor(ibegin,iend,temp_tensor); + sub.forward(temp_tensor); + loss.to_label(sub, obegin); + } + + + const label_type& operator() (const input_type& x) + /*! + ensures + - runs a single x through the network and returns the output. 
+        !*/
+        {
+            (*this)(&x, &x+1, &temp_label);
+            return temp_label;
+        }
+
+
+        template <typename input_iterator, typename label_iterator>
+        double compute_loss (
+            input_iterator ibegin,
+            input_iterator iend,
+            label_iterator lbegin
+        )
+        {
+            sub.to_tensor(ibegin,iend,temp_tensor);
+            sub.forward(temp_tensor);
+            dimpl::sub_net_wrapper<sub_net_type> wsub(sub);
+            return loss.compute_loss(temp_tensor, lbegin, wsub);
+        }
+
+        template <typename input_iterator>
+        double compute_loss (
+            input_iterator ibegin,
+            input_iterator iend
+        )
+        {
+            sub.to_tensor(ibegin,iend,temp_tensor);
+            sub.forward(temp_tensor);
+            dimpl::sub_net_wrapper<sub_net_type> wsub(sub);
+            return loss.compute_loss(temp_tensor, wsub);
+        }
+
+        template <typename input_iterator, typename label_iterator, typename solver_type>
+        double update (
+            input_iterator ibegin,
+            input_iterator iend,
+            label_iterator lbegin,
+            sstack<solver_type,num_layers>& solvers
+        )
+        {
+            sub.to_tensor(ibegin,iend,temp_tensor);
+            sub.forward(temp_tensor);
+            dimpl::sub_net_wrapper<sub_net_type> wsub(sub);
+            double l = loss.compute_loss(temp_tensor, lbegin, wsub);
+            sub.update(temp_tensor, solvers);
+            return l;
+        }
+
+        template <typename input_iterator, typename solver_type>
+        double update (
+            input_iterator ibegin,
+            input_iterator iend,
+            sstack<solver_type,num_layers>& solvers
+        )
+        {
+            sub.to_tensor(ibegin,iend,temp_tensor);
+            sub.forward(temp_tensor);
+            dimpl::sub_net_wrapper<sub_net_type> wsub(sub);
+            double l = loss.compute_loss(temp_tensor, wsub);
+            sub.update(temp_tensor, solvers);
+            return l;
+        }
+
+        const sub_net_type& sub_net() const { return sub; }
+        sub_net_type& sub_net() { return sub; }
+        const loss_details_type& loss_details() const { return loss; }
+        loss_details_type& loss_details() { return loss; }
+
+        void clean (
+        )
+        /*!
+            ensures
+                - Causes the network to forget about everything but its parameters.
+                  That is, for each layer we will have:
+                    - get_output().num_samples() == 0
+                    - get_gradient_input().num_samples() == 0
+                  However, running new input data through this network will still give
+                  the same output it would have had regardless of any calls to clean().
+                  Finally, the purpose of clean() is to compact the network object prior to
+                  saving it to disk so that it takes up less space and the IO is quicker.
+        !*/
+        {
+            temp_tensor.clear();
+            sub.clean();
+        }
+
+    private:
+
+        loss_details_type loss;
+        sub_net_type sub;
+
+        // These two objects don't logically contribute to the state of this object.  They
+        // are here to prevent them from being reallocated over and over.
+ label_type temp_label; + resizable_tensor temp_tensor; + }; + + + template <typename T, typename U> + struct is_loss_layer_type<add_loss<T,U>> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <unsigned int i, typename T> + struct layer_helper + { + static T& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().sub_net())>::type; + using type = typename layer_helper<i-1,next_type>::type; + static type& layer(T& n) + { + return layer_helper<i-1,next_type>::layer(n.sub_net()); + } + }; + template <typename T> + struct layer_helper<0,T> + { + using type = T; + static type& layer(T& n) + { + return n; + } + }; + + template <template<typename> class Match, typename T, unsigned int i, typename enabled = void> + struct layer_helper_match + { + static T& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().sub_net())>::type; + using type = typename layer_helper_match<Match,next_type,i>::type; + static type& layer(T& n) + { + return layer_helper_match<Match,next_type,i>::layer(n.sub_net()); + } + }; + // This overload catches add_layer and add_loss templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const T,const Match<typename T::sub_net_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + // This overload catches input templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const T,const Match<typename T::input_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + // This overload catches sub_net_wrapper templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const typename T::wrapped_type, + const Match<typename T::wrapped_type::sub_net_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + } + + template <unsigned int i, typename T> + typename impl::layer_helper<i,T>::type& layer (T& n) + { + return impl::layer_helper<i,T>::layer(n); + } + + template <template<typename> class Match, typename T> + typename impl::layer_helper_match<Match,T,0>::type& layer (T& n) + { + return impl::layer_helper_match<Match,T,0>::layer(n); + } + + template <template<typename> class Match, unsigned int i, typename T> + typename impl::layer_helper_match<Match,T,i>::type& layer (T& n) + { + return impl::layer_helper_match<Match,T,i>::layer(n); + } + +// ---------------------------------------------------------------------------------------- + + template <template<typename> class TAG_TYPE, typename SUB_NET> + class add_skip + { + /*! + WHAT THIS OBJECT REPRESENTS + This object draws its inputs from layer<TAG_TYPE>(SUB_NET()) + and performs the identity transform. 
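+
+            For example (a sketch, where sample_type stands for your input type), in
+                fc<skip1<fc<tag1<input<sample_type>>>>>
+            the skip1 layer outputs whatever the tag1 layer outputs, so the outermost
+            fc layer draws its input from the tagged layer rather than from the fc
+            layer directly beneath the skip.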
+ !*/ + public: + typedef SUB_NET sub_net_type; + typedef typename sub_net_type::input_type input_type; + const static size_t num_layers = sub_net_type::num_layers + 1; + const static unsigned int sample_expansion_factor = sub_net_type::sample_expansion_factor; + static_assert(sample_expansion_factor >= 1, + "The input layer can't produce fewer output tensors than there are inputs."); + + add_skip() = default; + add_skip(const add_skip&) = default; + add_skip(add_skip&&) = default; + add_skip& operator=(add_skip&&) = default; + add_skip& operator=(const add_skip&) = default; + + template <typename T> + add_skip( + const add_skip<TAG_TYPE,T>& item + ) : sub_network(item.sub_net()) + {} + + template <typename ...T> + add_skip( + T ...args + ) : + sub_network(std::move(args)...) + { + } + + template <typename input_iterator> + void to_tensor ( + input_iterator begin, + input_iterator end, + resizable_tensor& data + ) const + { + sub_network.to_tensor(begin,end,data); + } + + template <typename input_iterator> + const tensor& operator() ( + input_iterator ibegin, + input_iterator iend + ) + { + sub_network(ibegin,iend); + return layer<TAG_TYPE>(sub_network).get_output(); + } + + const tensor& operator() (const input_type& x) + { + sub_network(x); + return layer<TAG_TYPE>(sub_network).get_output(); + } + + const tensor& forward(const tensor& x) + { + sub_network.forward(x); + return layer<TAG_TYPE>(sub_network).get_output(); + } + + const tensor& get_output() const + { + return layer<TAG_TYPE>(sub_network).get_output(); + } + + tensor& get_gradient_input() + { + return layer<TAG_TYPE>(sub_network).get_gradient_input(); + } + + template <typename solver_type> + void update(const tensor& x, sstack<solver_type,num_layers>& solvers) + { + sub_network.update(x,solvers.pop()); + } + + const sub_net_type& sub_net() const + { + return sub_network; + } + + sub_net_type& sub_net() + { + return sub_network; + } + + void clean() + { + sub_network.clean(); + } + + private: + + sub_net_type sub_network; + }; + template <template<typename> class T, typename U> + struct is_layer_type<add_skip<T,U>> : std::true_type {}; + + template <typename SUB_NET> using tag1 = add_tag< 1, SUB_NET>; + template <typename SUB_NET> using tag2 = add_tag< 2, SUB_NET>; + template <typename SUB_NET> using tag3 = add_tag< 3, SUB_NET>; + template <typename SUB_NET> using tag4 = add_tag< 4, SUB_NET>; + template <typename SUB_NET> using tag5 = add_tag< 5, SUB_NET>; + template <typename SUB_NET> using tag6 = add_tag< 6, SUB_NET>; + template <typename SUB_NET> using tag7 = add_tag< 7, SUB_NET>; + template <typename SUB_NET> using tag8 = add_tag< 8, SUB_NET>; + template <typename SUB_NET> using tag9 = add_tag< 9, SUB_NET>; + template <typename SUB_NET> using tag10 = add_tag<10, SUB_NET>; + + template <typename SUB_NET> using skip1 = add_skip< tag1, SUB_NET>; + template <typename SUB_NET> using skip2 = add_skip< tag2, SUB_NET>; + template <typename SUB_NET> using skip3 = add_skip< tag3, SUB_NET>; + template <typename SUB_NET> using skip4 = add_skip< tag4, SUB_NET>; + template <typename SUB_NET> using skip5 = add_skip< tag5, SUB_NET>; + template <typename SUB_NET> using skip6 = add_skip< tag6, SUB_NET>; + template <typename SUB_NET> using skip7 = add_skip< tag7, SUB_NET>; + template <typename SUB_NET> using skip8 = add_skip< tag8, SUB_NET>; + template <typename SUB_NET> using skip9 = add_skip< tag9, SUB_NET>; + template <typename SUB_NET> using skip10 = add_skip<tag10, SUB_NET>; + +// 
----------------------------------------------------------------------------------------
+
+    namespace timpl
+    {
+        // inline so this header can be included in multiple translation units
+        // without ODR violations.
+        inline void fill_with_gaussian_random_numbers (
+            tensor& t,
+            dlib::rand& rnd,
+            double sigma = 1
+        )
+        {
+            float* data = t.host();
+            for (size_t i = 0; i < t.size(); ++i)
+                data[i] = rnd.get_random_gaussian()*sigma;
+        }
+
+        class test_layer_sub_net
+        {
+        public:
+            test_layer_sub_net (
+                dlib::rand& rnd_
+            ) : rnd(rnd_)
+            {
+                // Output and gradient_input have to have the same dimensions in each
+                // layer.
+                const long num_samples = rnd.get_random_32bit_number()%4+3;
+                const long nr = rnd.get_random_32bit_number()%4+2;
+                const long nc = rnd.get_random_32bit_number()%4+2;
+                const long k = rnd.get_random_32bit_number()%4+2;
+
+                output.set_size(num_samples, nr, nc, k);
+                gradient_input.set_size(num_samples, nr, nc, k);
+
+                // Use a non-zero initial gradient to make sure the layers add to it
+                // rather than assign and blow away the initial value.
+                fill_with_gaussian_random_numbers(gradient_input, rnd, 0.01);
+
+                fill_with_gaussian_random_numbers(output, rnd);
+            }
+
+
+            const tensor& get_output() const { return output; }
+            const test_layer_sub_net& sub_net() const { init_sub(); return *sub; }
+
+            tensor& get_gradient_input() { return gradient_input; }
+            test_layer_sub_net& sub_net() { init_sub(); return *sub; }
+
+
+
+            unsigned long count_outputs() const
+            {
+                if (sub)
+                    return sub->count_outputs() + output.size();
+                else
+                    return output.size();
+            }
+
+            float& get_output_element(unsigned long i)
+            {
+                if (i < output.size())
+                    return output.host()[i];
+                else
+                    return sub_net().get_output_element(i-output.size());
+            }
+
+            float get_gradient_input_element(unsigned long i) const
+            {
+                if (i < gradient_input.size())
+                    return gradient_input.host()[i];
+                else
+                    return sub_net().get_gradient_input_element(i-gradient_input.size());
+            }
+
+
+        private:
+            // We lazily initialize sub-layers as needed when someone tries to call
+            // sub_net()
+            void init_sub() const
+            {
+                if (!sub)
+                    sub.reset(new test_layer_sub_net(rnd));
+            }
+
+            dlib::rand& rnd;
+            mutable std::unique_ptr<test_layer_sub_net> sub;
+            resizable_tensor output;
+            resizable_tensor gradient_input;
+        };
+
+
+        inline void print_tensor(
+            const tensor& a
+        )
+        {
+            auto data = a.host();
+            for (size_t i = 0; i < a.size(); ++i)
+                std::cout << data[i] << " ";
+            std::cout << std::endl;
+        }
+    }
+
+    template <
+        typename layer_details_type
+        >
+    void test_layer (
+        layer_details_type l
+    )
+    {
+        const float base_eps = 0.01;
+        using namespace timpl;
+        // Do some setup
+        dlib::rand rnd;
+        test_layer_sub_net sub(rnd);
+        resizable_tensor output, out2, out3;
+        // Run setup() and forward() as well to make sure any calls to sub_net() have
+        // happened before we start assuming we know how many data elements there are
+        // (since we do a lazy layer creation thing based on calls to sub_net() inside
+        // test_layer_sub_net).
+        l.setup(sub);
+        l.forward(sub, output);
+
+        resizable_tensor input_grad;
+        input_grad.copy_size(output);
+        std::cout << "output.num_samples(): "<< output.num_samples() << std::endl;
+        fill_with_gaussian_random_numbers(input_grad, rnd);
+
+        // The f() we are computing gradients of is this thing.  Its value at the current
+        // parameter and data values is:
+        std::cout << "f(data,params): " << dot(output, input_grad) << std::endl;
+
+        // We are going to save a copy of the sub.get_gradient_input() data before we do
+        // backpropagation since the backward() function is supposed to *add* to the
+        // gradients rather than overwrite them.
We will use this saved data to check if + // that is the case. + const unsigned long num_data_inputs = sub.count_outputs(); + std::vector<float> initial_gradient_input(num_data_inputs); + for (unsigned long i = 0; i < num_data_inputs; ++i) + initial_gradient_input[i] = sub.get_gradient_input_element(i); + + + // Now tell the layer to compute all the gradients. In the rest of this function + // we will just be checking that these gradients were computed correctly by + // comparing them to a central differences approximation. + resizable_tensor params_grad, random_noise; + params_grad.copy_size(l.get_layer_params()); + random_noise.copy_size(l.get_layer_params()); + randomize_parameters(random_noise, 5, rnd); + params_grad = random_noise; + l.backward(input_grad, sub, params_grad); + + running_stats<double> rs_param, rs_data; + + // ================================================================== + // first validate the way the parameter gradients are computed + for (long i = 0; i < params_grad.size(); ++i) + { + layer_details_type l1(l); + + float eps = l1.get_layer_params().host()[i]*base_eps; + if (eps == 0) + eps = base_eps; + const float oldval = l1.get_layer_params().host()[i]; + l1.get_layer_params().host()[i] = oldval+eps; + l1.forward(sub, out2); + l1.get_layer_params().host()[i] = oldval-eps; + l1.forward(sub, out3); + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. + double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = params_grad.host()[i]-random_noise.host()[i]; + double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100); + if (std::abs(relative_error) > 0.01) + { + using namespace std; + cout << "PARAM ERROR: "<< relative_error << endl; + cout << " reference_derivative: " << reference_derivative << endl; + cout << " output_derivative: " << output_derivative << endl; + } + + rs_param.add(std::abs(relative_error)); + } + + // ================================================================== + // now validate the data gradients + for (unsigned long i = 0; i < num_data_inputs; ++i) + { + const float oldval = sub.get_output_element(i); + float eps = oldval*base_eps; + if (eps == 0) + eps = base_eps; + sub.get_output_element(i) = oldval+eps; + l.forward(sub, out2); + sub.get_output_element(i) = oldval-eps; + l.forward(sub, out3); + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. 
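+            // (As a sketch of the check being performed: with
+            //     f(v) = dot(network_output(v), input_grad),
+            // central differences give f'(v) ≈ (f(v+eps) - f(v-eps))/(2*eps),
+            // which is what the next line computes from out2 and out3.)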
+            double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
+            double output_derivative = sub.get_gradient_input_element(i)-initial_gradient_input[i];
+            double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
+            if (std::abs(relative_error) > 0.01)
+            {
+                using namespace std;
+                cout << "DATA ERROR: "<< relative_error << endl;
+                cout << "   reference_derivative: " << reference_derivative << endl;
+                cout << "   output_derivative:    " << output_derivative << endl;
+            }
+            rs_data.add(std::abs(relative_error));
+        }
+
+        using namespace std;
+        if (rs_param.current_n() > 1)
+        {
+            cout << "rs_param.mean():   " << rs_param.mean() << endl;
+            cout << "rs_param.stddev(): " << rs_param.stddev() << endl;
+            cout << "rs_param.max():    " << rs_param.max() << endl;
+        }
+        if (rs_data.current_n() > 1)
+        {
+            cout << "rs_data.mean():   " << rs_data.mean() << endl;
+            cout << "rs_data.stddev(): " << rs_data.stddev() << endl;
+            cout << "rs_data.max():    " << rs_data.max() << endl;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename solver_type = sgd
+        >
+    class dnn_trainer
+    {
+    public:
+
+        static_assert(is_loss_layer_type<net_type>::value,
+            "The last layer in a network must be a loss layer.");
+
+        typedef typename net_type::label_type label_type;
+        typedef typename net_type::input_type input_type;
+
+        dnn_trainer()
+        {}
+
+        explicit dnn_trainer(const net_type& net_) : net(net_) {}
+
+        dnn_trainer(
+            const net_type& net_,
+            const solver_type& solver_
+        ) : net(net_), solvers(solver_) {}
+
+        const net_type& get_net (
+        ) const { return net; }
+
+        void set_net (
+            const net_type& net_
+        )
+        {
+            net = net_;
+        }
+
+        void set_solver (
+            const solver_type& solver_
+        )
+        {
+            solvers = solver_;
+        }
+
+        const sstack<solver_type,net_type::num_layers>& get_solvers (
+        ) const { return solvers; }
+
+        sstack<solver_type,net_type::num_layers>& get_solvers (
+        ) { return solvers; }
+
+        const net_type& train (
+            const std::vector<input_type>& data,
+            const std::vector<label_type>& labels
+        )
+        /*!
+            requires
+                - data.size() == labels.size()
+        !*/
+        {
+            const int batch_size = 11;
+            for (int iter = 0; iter < 300; ++iter)
+            {
+                for (unsigned long i = 0; i < data.size(); i+=batch_size)
+                {
+                    // TODO, move the contents of update() here and do the alternating tensor
+                    // loading thing to hide GPU transfer latency.
+                    std::cout << "loss: "<<net.update(data.begin()+i,
+                        data.begin()+std::min<size_t>(i+batch_size, data.size()),
+                        labels.begin()+i,
+                        solvers) << std::endl;
+                }
+            }
+            return net;
+        }
+
+        const net_type& train (
+            const std::vector<input_type>& data
+        )
+        /*!
+            ensures
+                - trains an auto-encoder
+        !*/
+        {
+            const bool has_unsupervised_loss = std::is_same<no_label_type, label_type>::value;
+            static_assert(has_unsupervised_loss,
+                "You can only call this version of train() when using an unsupervised loss.");
+
+            const int batch_size = 10;
+            for (int iter = 0; iter < 300; ++iter)
+            {
+                for (unsigned long i = 0; i < data.size(); i+=batch_size)
+                {
+                    // TODO, move the contents of update() here and do the alternating tensor
+                    // loading thing to hide GPU transfer latency.
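+                    // The std::min() below clamps the end of the mini-batch so the
+                    // final, possibly partial batch never reads past data.end().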
+                    std::cout << "loss: "<<net.update(data.begin()+i,
+                        data.begin()+std::min<size_t>(i+batch_size, data.size()),
+                        solvers) << std::endl;
+                }
+            }
+            return net;
+        }
+
+    private:
+
+        net_type net;
+        sstack<solver_type,net_type::num_layers> solvers;
+    };
+
+    // TODO, make dnn_trainer serializable.
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_CORE_H_
+
+
diff --git a/dlib/dnn/core_abstract.h b/dlib/dnn/core_abstract.h
new file mode 100644
index 0000000000000000000000000000000000000000..035c5758e09723aea9054913ab203c85809e240f
--- /dev/null
+++ b/dlib/dnn/core_abstract.h
@@ -0,0 +1,738 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_CORE_ABSTRACT_H_
+#ifdef DLIB_DNn_CORE_ABSTRACT_H_
+
+#include "tensor_abstract.h"
+#include "solvers_abstract.h"
+#include <memory>
+#include <type_traits>
+#include <dlib/statistics.h>
+#include <dlib/rand.h>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    void randomize_parameters (
+        tensor& params,
+        unsigned long num_inputs_and_outputs,
+        dlib::rand& rnd
+    );
+    /*!
+        ensures
+            - This function assigns random values into params based on the given random
+              number generator.  In particular, it uses the parameter initialization method
+              of formula 16 from the paper "Understanding the difficulty of training deep
+              feedforward neural networks" by Xavier Glorot and Yoshua Bengio.
+            - It is assumed that the total number of inputs and outputs from the layer is
+              num_inputs_and_outputs.  That is, you should set num_inputs_and_outputs to
+              the sum of the dimensionalities of the vectors going into and out of the
+              layer that uses params as its parameters.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T,
+        size_t N
+        >
+    class sstack
+    {
+        /*!
+            REQUIREMENTS ON T
+                - T is default and copy constructible.
+
+            REQUIREMENTS ON N
+                - N > 0
+
+            WHAT THIS OBJECT REPRESENTS
+                This is a basic stack of T objects.  It holds N of the objects and is
+                entirely allocated on the stack.
+        !*/
+
+    public:
+        typedef T value_type;
+        const static size_t num_elements = N;
+
+        sstack(
+        );
+        /*!
+            ensures
+                - #size() == N
+                - All elements of this stack are default constructed.
+        !*/
+
+        sstack(
+            const T& item
+        );
+        /*!
+            ensures
+                - #size() == N
+                - Initializes all N elements in this stack with the given item.
+                  E.g. top()==item, pop().top()==item, pop().pop().top()==item, etc.
+        !*/
+
+        const T& top(
+        ) const;
+        /*!
+            ensures
+                - returns the top element of the stack.
+        !*/
+
+        T& top(
+        );
+        /*!
+            ensures
+                - returns the top element of the stack.
+        !*/
+
+
+        size_t size(
+        ) const;
+        /*!
+            ensures
+                - returns the number of elements in this stack.  In particular, the
+                  number returned is always N.
+        !*/
+
+        const sstack<T,N-1>& pop(
+        ) const;
+        /*!
+            requires
+                - size() > 1
+            ensures
+                - returns a reference to the sub-stack S such that:
+                    - S.size() == size()-1.
+                    - S.top() is the next element in the stack.
+        !*/
+
+        sstack<T,N-1>& pop(
+        );
+        /*!
+            requires
+                - size() > 1
+            ensures
+                - returns a reference to the sub-stack S such that:
+                    - S.size() == size()-1.
+                    - S.top() is the next element in the stack.
+ !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename LAYER_DETAILS, + typename SUB_NET + > + class add_layer + { + /*! + REQUIREMENTS ON LAYER_DETAILS + - Must be a type that implements the EXAMPLE_LAYER_ interface defined in + layers_abstract.h + + REQUIREMENTS ON SUB_NET + - One of the following must be true: + - SUB_NET implements the input interface (TODO clarify) defined in + input_abstract.h. + - SUB_NET is an add_layer object. + - SUB_NET is an add_tag object. + - SUB_NET is an add_skip object. + + WHAT THIS OBJECT REPRESENTS + Stacks a new layer, defined by LAYER_DETAILS, on top of SUB_NET type. + !*/ + + public: + typedef LAYER_DETAILS layer_details_type; + typedef SUB_NET sub_net_type; + typedef typename sub_net_type::input_type input_type; + const static unsigned int sample_expansion_factor = sub_net_type::sample_expansion_factor; + // If SUB_NET is an input layer then num_layers == 1, otherwise it has the + // definition shown here: + const static size_t num_layers = sub_net_type::num_layers + 1; + + add_layer( + ); + + add_layer(const add_layer&) = default; + add_layer(add_layer&&) = default; + add_layer& operator=(add_layer&&) = default; + add_layer& operator=(const add_layer&) = default; + + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. + template <typename T, typename U> + add_layer( + const add_layer<T,U>& item + ); + /*! + ensures + - #layer_details() == layer_details_type(item.layer_details()) + - #sub_net() == sub_net_type(item.sub_net()) + !*/ + + template <typename ...T> + add_layer( + const LAYER_DETAILS& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_details_type(layer_det) + - #sub_net() == sub_net_type(args) + !*/ + + template <typename ...T> + add_layer( + LAYER_DETAILS&& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_details_type(layer_det) + - #sub_net() == sub_net_type(args) + !*/ + + template <typename input_iterator> + void to_tensor ( + input_iterator begin, + input_iterator end, + resizable_tensor& data + ) const; + /*! + requires + - [begin, end) is an iterator range over input_type objects. + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples() == distance(begin,end)*sample_expansion_factor. + - Invokes data.async_copy_to_device() so that the data begins transferring + to the device. + - Ultimately this function just calls sub_net().sub_net()...sub_net().to_tensor(begin,end,data). + !*/ + + template <typename input_iterator> + const tensor& operator() ( + input_iterator ibegin, + input_iterator iend + ); + /*! + ensures + - runs [ibegin,iend) through the network and returns the results. + In particular, this function performs: + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + - The return value from this function is also available in #get_output(). + - have_same_dimensions(#get_gradient_input(), #get_output()) == true + - All elements of #get_gradient_input() are set to 0. + !*/ + + + const tensor& operator() ( + const input_type& x + ); + /*! + ensures + - runs a single x through the network and returns the output. + I.e. returns (*this)(&x, &x+1); + !*/ + + const tensor& forward( + const tensor& x + ); + /*! + ensures + - Runs x through the network and returns the results. 
  In particular, this
+              function performs the equivalent of:
+                sub_net().forward(x);
+                if (this is the first time forward() has been called) then
+                    layer_details().setup(sub_net());
+                layer_details().forward(sub_net(), get_output());
+            - The return value from this function is also available in #get_output().
+            - have_same_dimensions(#get_gradient_input(), #get_output()) == true
+            - All elements of #get_gradient_input() are set to 0.
+    !*/
+
+        const tensor& get_output(
+        ) const;
+        /*!
+            ensures
+                - returns the output for the last tensor that was run through the network.
+                  If nothing has been run through the network yet then returns an empty
+                  tensor.
+        !*/
+
+        tensor& get_gradient_input(
+        );
+        /*!
+            ensures
+                - returns the error gradient for this network.  That is, this is the
+                  gradient of some loss with respect to get_output(), and callers
+                  (e.g. loss layers) accumulate their gradients into this tensor.
+                - have_same_dimensions(#get_gradient_input(), get_output()) == true
+                - The first time this function is called after a call to forward(),
+                  all the elements of the returned tensor are set to 0.
+        !*/
+
+        template <typename solver_type>
+        void update(
+            const tensor& x,
+            sstack<solver_type,num_layers>& solvers
+        );
+        /*!
+            requires
+                - forward(x) was called to forward propagate x through the network.
+                - x.num_samples() == get_gradient_input().num_samples()
+                - get_gradient_input() == the gradient of the network with respect
+                  to some loss.
+            ensures
+                - Back propagates the gradient in get_gradient_input() through this
+                  layer and uses the given solvers to update the parameters of this
+                  layer and all the layers below it.
+        !*/
+
+        const sub_net_type& sub_net(
+        ) const;
+        /*!
+            ensures
+                - returns the sub network this layer sits on top of.
+        !*/
+
+        sub_net_type& sub_net(
+        );
+        /*!
+            ensures
+                - returns the sub network this layer sits on top of.
+        !*/
+
+        const layer_details_type& layer_details(
+        ) const;
+        /*!
+            ensures
+                - returns the LAYER_DETAILS object that defines this layer's behavior.
+        !*/
+
+        layer_details_type& layer_details(
+        );
+        /*!
+            ensures
+                - returns the LAYER_DETAILS object that defines this layer's behavior.
+        !*/
+
+        void clean(
+        );
+        /*!
+            ensures
+                - Causes the network to forget about everything except its parameters,
+                  making the network object take up much less space right before it is
+                  saved to disk.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    class no_label_type;
+
+    template <
+        typename LOSS_DETAILS,
+        typename SUB_NET
+        >
+    class add_loss
+    {
+        /*!
+            REQUIREMENTS ON LOSS_DETAILS
+                - Must be a type that implements the loss layer interface (see the
+                  loss layers defined in loss.h).
+                - LOSS_DETAILS::sample_expansion_factor == SUB_NET::sample_expansion_factor
+                  i.e. The loss layer and input layer must agree on the sample_expansion_factor.
+
+            REQUIREMENTS ON SUB_NET
+                - One of the following must be true:
+                    - SUB_NET is an add_layer object.
+                    - SUB_NET is an add_tag object.
+                    - SUB_NET is an add_skip object.
+
+            WHAT THIS OBJECT REPRESENTS
+                - Adds a loss layer, defined by LOSS_DETAILS, on top of SUB_NET.
+        !*/
+
+    public:
+        typedef LOSS_DETAILS loss_details_type;
+        typedef SUB_NET sub_net_type;
+        typedef typename sub_net_type::input_type input_type;
+        // Note that the loss layer doesn't count as an additional layer.
+        const static size_t num_layers = sub_net_type::num_layers;
+        const static unsigned int sample_expansion_factor = sub_net_type::sample_expansion_factor;
+        // If LOSS_DETAILS is an unsupervised loss then label_type==no_label_type.
+        // Otherwise it is defined as follows:
+        typedef typename LOSS_DETAILS::label_type label_type;
+
+        static_assert(sample_expansion_factor == LOSS_DETAILS::sample_expansion_factor,
+            "The loss layer and input layer must agree on the sample_expansion_factor.");
+
+
+        add_loss() = default;
+        add_loss(const add_loss&) = default;
+        add_loss(add_loss&&) = default;
+        add_loss& operator=(add_loss&&) = default;
+        add_loss& operator=(const add_loss&) = default;
+
+        template <typename T, typename U>
+        add_loss(
+            const add_loss<T,U>& item
+        );
+        /*!
+            ensures
+                - #loss_details() == loss_details_type(item.loss_details())
+                - #sub_net() == sub_net_type(item.sub_net())
+        !*/
+
+        template <typename ...T>
+        add_loss(
+            const LOSS_DETAILS& layer_det,
+            T&& ...args
+        );
+        /*!
+            ensures
+                - #loss_details() == loss_details_type(layer_det)
+                - #sub_net() == sub_net_type(args)
+        !*/
+
+        template <typename ...T>
+        add_loss(
+            LOSS_DETAILS&& layer_det,
+            T&& ...args
+        );
+        /*!
+            ensures
+                - #loss_details() == loss_details_type(layer_det)
+                - #sub_net() == sub_net_type(args)
+        !*/
+
+        template <typename ...T>
+        add_loss(
+            T ...args
+        );
+        /*!
+            ensures
+                - #loss_details() == loss_details_type()
+                - #sub_net() == sub_net_type(args)
+        !*/
+
+        template <typename input_iterator, typename output_iterator>
+        void operator() (
+            input_iterator ibegin,
+            input_iterator iend,
+            output_iterator obegin
+        );
+        /*!
+            requires
+                - obegin == iterator pointing to the start of a range of distance(ibegin,iend)
+                  elements.
+            ensures
+                - runs [ibegin,iend) through the network and writes the output to the range at obegin.
+        !*/
+
+
+        const label_type& operator() (
+            const input_type& x
+        );
+        /*!
+            ensures
+                - runs a single x through the network and returns the output.
+        !*/
+
+
+        template <typename input_iterator, typename label_iterator>
+        double compute_loss (
+            input_iterator ibegin,
+            input_iterator iend,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - lbegin == iterator pointing to the start of a range of
+                  distance(ibegin,iend) labels.
+            ensures
+                - runs [ibegin,iend) through the network and returns the resulting loss
+                  with respect to the given labels.
+        !*/
+
+        template <typename input_iterator>
+        double compute_loss (
+            input_iterator ibegin,
+            input_iterator iend
+        );
+        /*!
+            ensures
+                - runs [ibegin,iend) through the network and returns the resulting loss.
+        !*/
+
+        template <typename input_iterator, typename label_iterator, typename solver_type>
+        double update (
+            input_iterator ibegin,
+            input_iterator iend,
+            label_iterator lbegin,
+            sstack<solver_type,num_layers>& solvers
+        );
+        /*!
+            requires
+                - lbegin == iterator pointing to the start of a range of
+                  distance(ibegin,iend) labels.
+            ensures
+                - runs [ibegin,iend) through the network, updates the network parameters
+                  using the given solvers, and returns the loss with respect to the given
+                  labels.
+        !*/
+
+        template <typename input_iterator, typename solver_type>
+        double update (
+            input_iterator ibegin,
+            input_iterator iend,
+            sstack<solver_type,num_layers>& solvers
+        );
+        /*!
+            ensures
+                - runs [ibegin,iend) through the network, updates the network parameters
+                  using the given solvers, and returns the loss.
+        !*/
+
+        const sub_net_type& sub_net(
+        ) const;
+        /*!
+            ensures
+                - returns the sub network this loss layer sits on top of.
+        !*/
+
+        sub_net_type& sub_net(
+        );
+        /*!
+            ensures
+                - returns the sub network this loss layer sits on top of.
+        !*/
+
+        const loss_details_type& loss_details(
+        ) const;
+        /*!
+            ensures
+                - returns the LOSS_DETAILS object that defines this network's loss.
+        !*/
+
+        loss_details_type& loss_details(
+        );
+        /*!
+            ensures
+                - returns the LOSS_DETAILS object that defines this network's loss.
+        !*/
+
+        void clean (
+        );
+        /*!
+            ensures
+                - Causes the network to forget about everything but its parameters.
+                  That is, for each layer we will have:
+                    - get_output().num_samples() == 0
+                    - get_gradient_input().num_samples() == 0
+                  However, running new input data through this network will still give
+                  the same output it would have had regardless of any calls to clean().
+                  Finally, the purpose of clean() is to compact the network object prior to
+                  saving it to disk so that it takes up less space and the IO is quicker.
+        !*/
+
+    };
+
+
+    template <typename T, typename U>
+    struct is_loss_layer_type<add_loss<T,U>> : std::true_type {};
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        unsigned int i,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is of type add_layer, add_loss, add_skip, or add_tag.
+        ensures
+            - This function chains together i calls to n.sub_net() and returns the
+              result.  So for example:
+                - if (i == 0)
+                    - returns n
+                - else if (i == 1)
+                    - returns n.sub_net()
+                - else if (i == 2)
+                    - returns n.sub_net().sub_net()
+                - else if (i == 3)
+                    - returns n.sub_net().sub_net().sub_net()
+                - else
+                    - etc.
+    !*/
+
+    template <
+        template<typename> class Match,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is of type add_layer, add_loss, add_skip, or add_tag.
+        ensures
+            - returns the first layer in n that is of type Match.  E.g. if net_type is
+              fc<relu<fc<input<sample_type>>>> then calling layer<relu>(n) would return
+              layer<1>(n), that is, a reference to the relu layer.
+    !*/
+
+    template <
+        template<typename> class Match,
+        unsigned int i,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is of type add_layer, add_loss, add_skip, or add_tag.
+        ensures
+            - returns layer<i>(layer<Match>(n))
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        unsigned long ID,
+        typename SUB_NET
+        >
+    class add_tag
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object draws its inputs from sub_net() and performs the identity
+                transform.  This means it is a no-op and its presence does not change
+                the behavior of the network.  It exists solely to be used by add_skip
+                to reference a particular part of a network.
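+
+                For example (a sketch, where sample_type stands for your input type),
+                tag1<fc<input<sample_type>>> behaves exactly like fc<input<sample_type>>,
+                but the tagged layer can later be looked up with layer<tag1>(net) or fed
+                to a skip1 layer.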
+
+        !*/
+    };
+
+    template <typename SUB_NET> using tag1 = add_tag< 1, SUB_NET>;
+    template <typename SUB_NET> using tag2 = add_tag< 2, SUB_NET>;
+    template <typename SUB_NET> using tag3 = add_tag< 3, SUB_NET>;
+    template <typename SUB_NET> using tag4 = add_tag< 4, SUB_NET>;
+    template <typename SUB_NET> using tag5 = add_tag< 5, SUB_NET>;
+    template <typename SUB_NET> using tag6 = add_tag< 6, SUB_NET>;
+    template <typename SUB_NET> using tag7 = add_tag< 7, SUB_NET>;
+    template <typename SUB_NET> using tag8 = add_tag< 8, SUB_NET>;
+    template <typename SUB_NET> using tag9 = add_tag< 9, SUB_NET>;
+    template <typename SUB_NET> using tag10 = add_tag<10, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        template<typename> class TAG_TYPE,
+        typename SUB_NET
+        >
+    class add_skip
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object draws its inputs from layer<TAG_TYPE>(sub_net())
+                and performs the identity transform.
+        !*/
+    };
+
+    template <typename SUB_NET> using skip1 = add_skip< tag1, SUB_NET>;
+    template <typename SUB_NET> using skip2 = add_skip< tag2, SUB_NET>;
+    template <typename SUB_NET> using skip3 = add_skip< tag3, SUB_NET>;
+    template <typename SUB_NET> using skip4 = add_skip< tag4, SUB_NET>;
+    template <typename SUB_NET> using skip5 = add_skip< tag5, SUB_NET>;
+    template <typename SUB_NET> using skip6 = add_skip< tag6, SUB_NET>;
+    template <typename SUB_NET> using skip7 = add_skip< tag7, SUB_NET>;
+    template <typename SUB_NET> using skip8 = add_skip< tag8, SUB_NET>;
+    template <typename SUB_NET> using skip9 = add_skip< tag9, SUB_NET>;
+    template <typename SUB_NET> using skip10 = add_skip<tag10, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename layer_details_type
+        >
+    void test_layer (
+        layer_details_type l
+    );
+    /*!
+        requires
+            - l implements the EXAMPLE_LAYER_ interface defined in layers_abstract.h
+        ensures
+            - tests l for compliance against the EXAMPLE_LAYER_ interface spec.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename solver_type = sgd
+        >
+    class dnn_trainer
+    {
+        /*!
+            REQUIREMENTS ON net_type
+                - net_type is an add_loss object.
+
+            REQUIREMENTS ON solver_type
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for training the parameters of the given net_type
+                on a dataset, using one solver_type object per layer to perform the
+                parameter updates.
+        !*/
+
+    public:
+
+        typedef typename net_type::label_type label_type;
+        typedef typename net_type::input_type input_type;
+
+        dnn_trainer(
+        );
+
+        explicit dnn_trainer(
+            const net_type& net
+        );
+
+        dnn_trainer(
+            const net_type& net,
+            const solver_type& solver
+        );
+
+        const net_type& get_net (
+        ) const;
+
+        void set_net (
+            const net_type& net
+        );
+
+        void set_solver (
+            const solver_type& solver_
+        );
+
+        const sstack<solver_type,net_type::num_layers>& get_solvers (
+        ) const;
+
+        sstack<solver_type,net_type::num_layers>& get_solvers (
+        );
+
+        const net_type& train (
+            const std::vector<input_type>& data,
+            const std::vector<label_type>& labels
+        );
+        /*!
+            requires
+                - data.size() == labels.size()
+                - TODO: the net has a supervised loss layer.
+        !*/
+
+        const net_type& train (
+            const std::vector<input_type>& data
+        );
+        /*!
+            requires
+                - TODO: the net has an unsupervised loss layer.
+            ensures
+                - trains an auto-encoder
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_CORE_H_
+
diff --git a/dlib/dnn/input.h b/dlib/dnn/input.h
new file mode 100644
index 0000000000000000000000000000000000000000..11389b1fe602fc4afeaf3bb145dba4fc1c6b36ef
--- /dev/null
+++ b/dlib/dnn/input.h
@@ -0,0 +1,137 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_INPUT_H_
+#define DLIB_DNn_INPUT_H_
+
+#include <dlib/matrix.h>
+#include <dlib/pixel.h>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename T>
+    class input
+    {
+    public:
+
+        // sample_expansion_factor must be > 0
+        const static unsigned int sample_expansion_factor = 1;
+        typedef T input_type;
+
+        template <typename input_iterator>
+        void to_tensor (
+            input_iterator begin,
+            input_iterator end,
+            resizable_tensor& data
+        ) const
+        /*!
+            requires
+                - [begin, end) is an iterator range over input_type objects.
+            ensures
+                - converts the iterator range into a tensor and stores it into #data.
+                - Normally you would have #data.num_samples() == distance(begin,end) but
+                  you can also expand the output by some integer factor so long as the loss
+                  you use can deal with it correctly.
+                - #data.num_samples() == distance(begin,end)*sample_expansion_factor.
+        !*/
+        {
+            // initialize data to the right size to contain the stuff in the iterator range.
+
+            for (input_iterator i = begin; i != end; ++i)
+            {
+                matrix<rgb_pixel> temp = *i;
+                // now copy *i into the right part of data.
+            }
+        }
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename T, long NR, typename MM, typename L>
+    class input<matrix<T,NR,1,MM,L>>
+    {
+    public:
+
+        // TODO, maybe we should only allow T to be float?  Seems kinda pointless to allow
+        // double.  Don't forget to remove the matrix_cast if we enforce just float.
+        typedef matrix<T,NR,1,MM,L> input_type;
+        const static unsigned int sample_expansion_factor = 1;
+
+        template <typename input_iterator>
+        void to_tensor (
+            input_iterator begin,
+            input_iterator end,
+            resizable_tensor& data
+        ) const
+        /*!
+            requires
+                - [begin, end) is an iterator range over input_type objects.
+            ensures
+                - converts the iterator range into a tensor and stores it into #data.
+                - Normally you would have #data.num_samples() == distance(begin,end) but
+                  you can also expand the output by some integer factor so long as the loss
+                  you use can deal with it correctly.
+                - #data.num_samples() == distance(begin,end)*sample_expansion_factor.
+        !*/
+        {
+            // initialize data to the right size to contain the stuff in the iterator range.
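+            // A note on layout, assuming the (num_samples, nr, nc, k) argument
+            // order of resizable_tensor::set_size() from tensor.h: each input
+            // column vector becomes one sample stored as a 1x1 spatial grid with
+            // begin->size() channels, i.e. begin->size() consecutive floats.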
+            data.set_size(std::distance(begin,end), 1, 1, begin->size());
+
+            unsigned long idx = 0;
+            for (input_iterator i = begin; i != end; ++i)
+            {
+                data.set_sample(idx++, matrix_cast<float>(*i));
+            }
+        }
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename T>
+    class input2
+    {
+    public:
+
+        input2(){}
+
+        input2(const input<T>&) {}
+
+        typedef T input_type;
+        const static unsigned int sample_expansion_factor = 1;
+
+        template <typename input_iterator>
+        void to_tensor (
+            input_iterator begin,
+            input_iterator end,
+            resizable_tensor& data
+        ) const
+        /*!
+            requires
+                - [begin, end) is an iterator range over T objects.
+            ensures
+                - converts the iterator range into a tensor and stores it into #data.
+                - Normally you would have #data.num_samples() == distance(begin,end) but
+                  you can also expand the output by some integer factor so long as the loss
+                  you use can deal with it correctly.
+                - #data.num_samples() == distance(begin,end)*K where K is an integer >= 1.
+        !*/
+        {
+            // initialize data to the right size to contain the stuff in the iterator range.
+
+            for (input_iterator i = begin; i != end; ++i)
+            {
+                matrix<rgb_pixel> temp = *i;
+                // now copy *i into the right part of data.
+            }
+        }
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // #define DLIB_DNn_INPUT_H_
+
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
new file mode 100644
index 0000000000000000000000000000000000000000..47a8de7a64fc21bc803ab2026b44a32a7c5f9220
--- /dev/null
+++ b/dlib/dnn/layers.h
@@ -0,0 +1,242 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_LAYERS_H_
+#define DLIB_DNn_LAYERS_H_
+
+#include "layers_abstract.h"
+#include "tensor.h"
+#include "core.h"
+#include <iostream>
+#include <string>
+#include <dlib/rand.h>
+#include <dlib/string.h>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class con_
+    {
+    public:
+        con_()
+        {}
+
+        template <typename SUB_NET>
+        void setup (const SUB_NET& sub)
+        {
+            // TODO
+        }
+
+        template <typename SUB_NET>
+        void forward(const SUB_NET& sub, resizable_tensor& output)
+        {
+            // TODO
+        }
+
+        template <typename SUB_NET>
+        void backward(const tensor& gradient_input, SUB_NET& sub, tensor& params_grad)
+        {
+            // TODO
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+    private:
+
+        resizable_tensor params;
+    };
+
+    template <typename SUB_NET>
+    using con = add_layer<con_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class fc_
+    {
+    public:
+        fc_() : num_outputs(1)
+        {
+            rnd.set_seed("fc_" + cast_to_string(num_outputs));
+        }
+
+        explicit fc_(unsigned long num_outputs_)
+        {
+            num_outputs = num_outputs_;
+            rnd.set_seed("fc_" + cast_to_string(num_outputs));
+        }
+
+        unsigned long get_num_outputs (
+        ) const { return num_outputs; }
+
+        template <typename SUB_NET>
+        void setup (const SUB_NET& sub)
+        {
+            num_inputs = sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k();
+            params.set_size(num_inputs, num_outputs);
+
+            std::cout << "fc_::setup() " << params.size() << std::endl;
+
+            randomize_parameters(params, num_inputs+num_outputs, rnd);
+        }
+
+        template <typename SUB_NET>
+        void forward(const SUB_NET& sub, resizable_tensor& output)
+        {
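+            // mat(sub.get_output()) views the input tensor as a matrix with one
+            // row per sample and nr()*nc()*k() == num_inputs columns, so the
+            // single matrix product below computes the outputs for every sample
+            // in the mini-batch at once.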
+            output.set_size(sub.get_output().num_samples(), num_outputs);
+
+            output = mat(sub.get_output())*mat(params);
+        }
+
+        template <typename SUB_NET>
+        void backward(const tensor& gradient_input, SUB_NET& sub, tensor& params_grad)
+        {
+            // Let X == mat(sub.get_output()) (one row per sample), W == mat(params),
+            // and D == mat(gradient_input).  forward() computed X*W, so differentiating
+            // dot(X*W, D) gives trans(X)*D as the gradient with respect to the
+            // parameters and D*trans(W) as the gradient with respect to the input data.
+
+            // compute the gradient of the parameters.
+            params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
+
+            // compute the gradient for the data
+            sub.get_gradient_input() += mat(gradient_input)*trans(mat(params));
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+    private:
+
+        unsigned long num_outputs;
+        unsigned long num_inputs;
+        resizable_tensor params;
+        dlib::rand rnd;
+    };
+
+
+    template <typename SUB_NET>
+    using fc = add_layer<fc_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class relu_
+    {
+    public:
+        relu_()
+        {
+        }
+
+        template <typename SUB_NET>
+        void setup (const SUB_NET& sub)
+        {
+        }
+
+        template <typename SUB_NET>
+        void forward(const SUB_NET& sub, resizable_tensor& output)
+        {
+            output.copy_size(sub.get_output());
+            output = lowerbound(mat(sub.get_output()), 0);
+        }
+
+        template <typename SUB_NET>
+        void backward(const tensor& gradient_input, SUB_NET& sub, tensor& params_grad)
+        {
+            const float* grad = gradient_input.host();
+            const float* in = sub.get_output().host();
+            float* out = sub.get_gradient_input().host();
+            for (unsigned long i = 0; i < sub.get_output().size(); ++i)
+            {
+                if (in[i] > 0)
+                    out[i] += grad[i];
+            }
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+    private:
+
+        resizable_tensor params;
+    };
+
+
+    template <typename SUB_NET>
+    using relu = add_layer<relu_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class multiply_
+    {
+    public:
+        multiply_()
+        {
+        }
+
+
+        template <typename SUB_NET>
+        void setup (const SUB_NET& sub)
+        {
+            num_inputs = sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k();
+            params.set_size(1, num_inputs);
+
+            std::cout << "multiply_::setup() " << params.size() << std::endl;
+
+            const int num_outputs = num_inputs;
+
+            randomize_parameters(params, num_inputs+num_outputs, rnd);
+        }
+
+        template <typename SUB_NET>
+        void forward(const SUB_NET& sub, resizable_tensor& output)
+        {
+            DLIB_CASSERT( sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k() == params.size(), "");
+            DLIB_CASSERT( sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k() == num_inputs, "");
+
+            output.copy_size(sub.get_output());
+            auto indata = sub.get_output().host();
+            auto outdata = output.host();
+            auto paramdata = params.host();
+            for (int i = 0; i < sub.get_output().num_samples(); ++i)
+            {
+                for (int j = 0; j < num_inputs; ++j)
+                {
+                    *outdata++ = *indata++ * paramdata[j];
+                }
+            }
+        }
+
+        template <typename SUB_NET>
+        void backward(const tensor& gradient_input, SUB_NET& sub, tensor& params_grad)
+        {
+            params_grad += sum_rows(pointwise_multiply(mat(sub.get_output()),mat(gradient_input)));
+
+            for (long i = 0; i < gradient_input.num_samples(); ++i)
+            {
+                sub.get_gradient_input().add_to_sample(i,
+                    pointwise_multiply(rowm(mat(gradient_input),i), mat(params)));
+            }
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+    private:
+
+        int num_inputs;
+        resizable_tensor params;
+        dlib::rand rnd;
+    };
+
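+    // The following restates the math implemented by multiply_ above; it adds no
+    // new behavior.  forward() scales each sample elementwise by the parameter
+    // row vector p, i.e. out(i,j) == in(i,j)*p(j).  Differentiating
+    // dot(OUT, gradient_input) as specified in layers_abstract.h then gives:
+    //     - gradient wrt p(j):     sum over samples i of in(i,j)*g(i,j)  (the sum_rows() term)
+    //     - gradient wrt in(i,j):  g(i,j)*p(j)                           (the add_to_sample() term)
+    // which is exactly what backward() accumulates.
+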
+    template <typename SUB_NET>
+    using multiply = add_layer<multiply_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // #define DLIB_DNn_LAYERS_H_
+
+
diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h
new file mode 100644
index 0000000000000000000000000000000000000000..847305424f56456d69d6d779e2cedb6ea98e5381
--- /dev/null
+++ b/dlib/dnn/layers_abstract.h
@@ -0,0 +1,238 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_LAYERS_ABSTRACT_H_
+#ifdef DLIB_DNn_LAYERS_ABSTRACT_H_
+
+#include "tensor_abstract.h"
+#include "core_abstract.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class SUB_NET
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                By "sub net" we mean the part of the network closer to the input.
+                Whenever you get a SUB_NET it will always have computed its outputs
+                and they will be available in get_output().
+        !*/
+
+    public:
+
+        const tensor& get_output(
+        ) const;
+
+        tensor& get_gradient_input(
+        );
+
+        const NEXT_SUB_NET& sub_net(
+        ) const;
+
+        NEXT_SUB_NET& sub_net(
+        );
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class EXAMPLE_LAYER_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                Each layer in a deep neural network can be thought of as a function,
+                f(data,parameters), that takes in a data tensor, some parameters, and
+                produces an output tensor.  You create an entire deep network by
+                composing these functions.  Importantly, you are able to use a wide
+                range of different functions to accommodate whatever task you are
+                trying to accomplish.  Dlib includes a number of common layer types,
+                but if you want to define your own then you simply implement a class
+                with the same interface as EXAMPLE_LAYER_.
+        !*/
+
+    public:
+
+        EXAMPLE_LAYER_(
+        );
+        /*!
+            ensures
+                - Default constructs this object.  This function is not required to do
+                  anything in particular, but it is required that layer objects be
+                  default constructible.
+        !*/
+
+        template <typename SUB_NET>
+        void setup (
+            const SUB_NET& sub
+        );
+        /*!
+            requires
+                - SUB_NET implements the SUB_NET interface defined at the top of this file.
+            ensures
+                - performs any necessary initial memory allocations and/or sets parameters
+                  to their initial values prior to learning.  Therefore, calling setup
+                  destroys any previously learned parameters.
+        !*/
+
+        template <typename SUB_NET>
+        void forward(
+            const SUB_NET& sub,
+            resizable_tensor& output
+        );
+        /*!
+            requires
+                - SUB_NET implements the SUB_NET interface defined at the top of this file.
+                - setup() has been called.
+            ensures
+                - Runs the output of the sub-network through this layer and stores the
+                  output into #output.  In particular, forward() can use any of the outputs
+                  in sub (e.g. sub.get_output(), sub.sub_net().get_output(), etc.) to
+                  compute whatever it wants.
+                - #output.num_samples() == sub.get_output().num_samples()
+        !*/
+
+        template <typename SUB_NET>
+        void backward(
+            const tensor& gradient_input,
+            SUB_NET& sub,
+            tensor& params_grad
+        );
+        /*!
+            requires
+                - SUB_NET implements the SUB_NET interface defined at the top of this file.
+                - setup() has been called.
+                - gradient_input has the same dimensions as the output of forward(sub,output).
+                - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true
+                - have_same_dimensions(params_grad, get_layer_params()) == true
+            ensures
+                - This function outputs the gradients of this layer with respect to the
+                  input data from sub and also with respect to this layer's parameters.
+                  These gradients are stored into #sub and #params_grad, respectively.  To
+                  be precise, the gradients are taken of a function f(sub,get_layer_params())
+                  which is defined as follows:
+                    - let OUT be the output of forward(sub,OUT).
+                    - let f(sub,get_layer_params()) == dot(OUT, gradient_input)
+                  Then we define the following gradient vectors:
+                    - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with
+                      respect to get_layer_params().
+                    - for all valid I:
+                        - DATA_GRADIENT_I == gradient of f(sub,get_layer_params()) with
+                          respect to layer<I>(sub).get_output() (recall that forward() can
+                          draw inputs from the immediate sub layer, sub.sub_net(), or
+                          any earlier layer.  So you must consider the gradients with
+                          respect to all inputs drawn from sub)
+                  Finally, backward() adds these gradients into the output by performing:
+                    - params_grad += PARAMETER_GRADIENT
+                    - for all valid I:
+                        - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
+        !*/
+
+        const tensor& get_layer_params(
+        ) const;
+        /*!
+            ensures
+                - returns the parameters that define the behavior of forward().
+        !*/
+
+        tensor& get_layer_params(
+        );
+        /*!
+            ensures
+                - returns the parameters that define the behavior of forward().
+        !*/
+
+    };
+
+    // For each layer you define, always define an add_layer template so that layers can be
+    // easily composed.  Moreover, the convention is that the layer class ends with an _
+    // while the add_layer template has the same name but without the trailing _.
+    template <typename SUB_NET>
+    using EXAMPLE_LAYER = add_layer<EXAMPLE_LAYER_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    class fc_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is an implementation of the EXAMPLE_LAYER_ interface defined above.
+                In particular, it defines a fully connected layer that takes an input
+                tensor, multiplies it by a weight matrix, and outputs the result.
+        !*/
+
+    public:
+        fc_(
+        );
+        /*!
+            ensures
+                - #get_num_outputs() == 1
+        !*/
+
+        explicit fc_(
+            unsigned long num_outputs
+        );
+        /*!
+            ensures
+                - #get_num_outputs() == num_outputs
+        !*/
+
+        unsigned long get_num_outputs (
+        ) const;
+        /*!
+            ensures
+                - This layer outputs column vectors that contain get_num_outputs()
+                  elements.  That is, the output tensor T from forward() will be such that:
+                    - T.num_samples() == however many samples were given to forward().
+                    - T.nr() == get_num_outputs()
+                    - The rest of the dimensions of T will be 1.
+        !*/
+
+        template <typename SUB_NET> void setup (const SUB_NET& sub);
+        template <typename SUB_NET> void forward(const SUB_NET& sub, resizable_tensor& output);
+        template <typename SUB_NET> void backward(const tensor& gradient_input, SUB_NET& sub, tensor& params_grad);
+        const tensor& get_layer_params() const;
+        tensor& get_layer_params();
+        /*!
+            These functions are implemented as described in the EXAMPLE_LAYER_ interface.
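+
+            For reference, a purely hypothetical way of composing this layer (using
+            the fc alias defined just below and the input layer from input.h):
+                using net_type = fc<relu<fc<input<matrix<float,0,1>>>>>;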
+        !*/
+    };
+
+
+    template <typename SUB_NET>
+    using fc = add_layer<fc_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class relu_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is an implementation of the EXAMPLE_LAYER_ interface defined above.
+                In particular, it defines a rectified linear layer.  That is, it passes
+                each element x of its input tensor through the function max(0,x).
+        !*/
+
+    public:
+
+        relu_(
+        );
+
+        template <typename SUB_NET> void setup (const SUB_NET& sub);
+        template <typename SUB_NET> void forward(const SUB_NET& sub, resizable_tensor& output);
+        template <typename SUB_NET> void backward(const tensor& gradient_input, SUB_NET& sub, tensor& params_grad);
+        const tensor& get_layer_params() const;
+        tensor& get_layer_params();
+        /*!
+            These functions are implemented as described in the EXAMPLE_LAYER_ interface.
+        !*/
+    };
+
+
+    template <typename SUB_NET>
+    using relu = add_layer<relu_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_LAYERS_ABSTRACT_H_
+
diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h
new file mode 100644
index 0000000000000000000000000000000000000000..b89c3fcf519a7e7e6fbc4987921040c299fb0e29
--- /dev/null
+++ b/dlib/dnn/loss.h
@@ -0,0 +1,175 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_LOSS_H_
+#define DLIB_DNn_LOSS_H_
+
+#include "core.h"
+#include <dlib/matrix.h>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    class loss_binary_hinge_
+    {
+    public:
+
+        const static unsigned int sample_expansion_factor = 1;
+        typedef double label_type;
+
+        // Implementing to_label() is optional.  If you don't do it then it just means the
+        // automatic operator() mapping from tensors to outputs is missing from the net object.
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const
+        /*!
+            requires
+                - SUB_TYPE implements the SUB_NET interface defined at the top of
+                  layers_abstract.h.
+                - sub.get_output().num_samples() must be a multiple of sample_expansion_factor.
+                - iter == an iterator pointing to the beginning of a range of
+                  sub.get_output().num_samples()/sample_expansion_factor elements.  In
+                  particular, they must be label_type elements.
+        !*/
+        {
+            const tensor& output_tensor = sub.get_output();
+            DLIB_CASSERT(output_tensor.nr() == 1 &&
+                         output_tensor.nc() == 1 &&
+                         output_tensor.k() == 1,"");
+            DLIB_CASSERT(output_tensor.num_samples()%sample_expansion_factor == 0,"");
+
+            const float* out_data = output_tensor.host();
+            for (unsigned long i = 0; i < output_tensor.num_samples(); ++i)
+            {
+                *iter++ = out_data[i];
+            }
+        }
+
+        template <
+            typename label_iterator,
+            typename SUB_NET
+            >
+        double compute_loss (
+            const tensor& input_tensor,
+            label_iterator truth, // TODO, this parameter is optional.
+            SUB_NET& sub
+        ) const
+        /*!
+            requires
+                - SUB_NET implements the SUB_NET interface defined at the top of
+                  layers_abstract.h.
+                - input_tensor was given as input to the network sub and the outputs are
+                  now visible in sub.get_output(), sub.sub_net().get_output(), etc.
+                - input_tensor.num_samples() must be a multiple of sample_expansion_factor.
+                - input_tensor.num_samples() == sub.get_output().num_samples() ==
+                  grad.num_samples(), where grad refers to sub.get_gradient_input().
+                - truth == an iterator pointing to the beginning of a range of
+                  input_tensor.num_samples()/sample_expansion_factor elements.  In
+                  particular, they must be label_type elements.
+                - sub.get_gradient_input() has the same dimensions as sub.get_output().
+                - for all valid i:
+                    - *(truth+i/sample_expansion_factor) is the label of the ith sample in
+                      sub.get_output().
+            ensures
+                - #sub.get_gradient_input() == the gradient of the loss with respect to
+                  sub.get_output().
+        !*/
+        {
+            const tensor& output_tensor = sub.get_output();
+            tensor& grad = sub.get_gradient_input();
+
+            // TODO, throw an exception instead of asserting, probably...
+            DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples(),"");
+            DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples(),"");
+            DLIB_CASSERT(output_tensor.nr() == 1 &&
+                         output_tensor.nc() == 1 &&
+                         output_tensor.k() == 1,"");
+
+            // The loss we output is the average loss over the mini-batch.
+            const double scale = 1.0/output_tensor.num_samples();
+            double loss = 0;
+            const float* out_data = output_tensor.host();
+            float* g = grad.host();
+            for (unsigned long i = 0; i < output_tensor.num_samples(); ++i)
+            {
+                const float y = *truth++;
+                const float temp = 1-y*out_data[i];
+                if (temp > 0)
+                {
+                    loss += scale*temp;
+                    g[i] = -scale*y;
+                }
+                else
+                {
+                    g[i] = 0;
+                }
+            }
+            return loss;
+        }
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename SUB_NET>
+    using loss_binary_hinge = add_loss<loss_binary_hinge_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_no_label_
+    {
+    public:
+
+        //typedef int label_type;
+
+        const static unsigned int sample_expansion_factor = 1;
+
+
+        template <
+            typename SUB_NET
+            >
+        double compute_loss (
+            const tensor& input_tensor,
+            SUB_NET& sub
+        ) const
+        /*!
+            requires
+                - SUB_NET implements the SUB_NET interface defined at the top of
+                  layers_abstract.h.
+                - input_tensor was given as input to the network sub and the outputs are
+                  now visible in sub.get_output(), sub.sub_net().get_output(), etc.
+                - input_tensor.num_samples() must be a multiple of sample_expansion_factor.
+                - input_tensor.num_samples() == sub.get_output().num_samples()
+                - sub.get_gradient_input() has the same dimensions as sub.get_output().
+            ensures
+                - #sub.get_gradient_input() == the gradient of the loss with respect to
+                  sub.get_output().
+        !*/
+        {
+            return 0;
+        }
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename SUB_NET>
+    using loss_no_label = add_loss<loss_no_label_, SUB_NET>;
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // #define DLIB_DNn_LOSS_H_
+
+
diff --git a/dlib/dnn/solvers.h b/dlib/dnn/solvers.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bb84f0230f563f2f49bfde40568315b3182bac7
--- /dev/null
+++ b/dlib/dnn/solvers.h
@@ -0,0 +1,55 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_SOLVERS_H_
+#define DLIB_DNn_SOLVERS_H_
+
+#include "tensor.h"
+#include <iostream>
+
+namespace dlib
+{
+    /*
+    class EXAMPLE_SOLVER
+    {
+    };
+    */
+
+    struct sgd
+    {
+
+        matrix<float> v;
+        float weight_decay;
+        float eps;
+        float momentum;
+
+        sgd(double eps_ = 0.001)
+        {
+            weight_decay = 0.0005;
+            eps = eps_;
+            //eps = 0.001;
+            momentum = 0.9;
+        }
+
+        template <typename layer_type>
+        void operator() (layer_type& l, const tensor& params_grad)
+        /*!
+            requires
+                - l.get_layer_params().size() != 0
+                - l.get_layer_params() and params_grad have the same dimensions.
+        !*/
+        {
+            if (v.size() != 0)
+                v = momentum*v - weight_decay*eps*mat(l.get_layer_params()) - eps*mat(params_grad);
+            else
+                v = - weight_decay*eps*mat(l.get_layer_params()) - eps*mat(params_grad);
+
+            l.get_layer_params() += v;
+        }
+    };
+
+
+}
+
+#endif // #define DLIB_DNn_SOLVERS_H_
+
+
+
diff --git a/dlib/dnn/tensor.h b/dlib/dnn/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d2e0b1f8ee824bb04393d83e082deba05217fed
--- /dev/null
+++ b/dlib/dnn/tensor.h
@@ -0,0 +1,458 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_TENSOR_H_
+#define DLIB_DNn_TENSOR_H_
+
+#include <memory>
+#include <cstring>
+#include <dlib/matrix.h>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class gpu_data
+    {
+        /*!
+            CONVENTION
+                - if (size() != 0) then
+                    - data_host == a pointer to size() floats in CPU memory.
+                    - if (data_device) then
+                        - data_device == a pointer to size() floats in device memory.
+
+                - We use the host_current and device_current bools to keep track of which
+                  copy of the data (or both) are most current.  E.g. if the CPU has
+                  modified the tensor and it hasn't been copied to the device yet then
+                  host_current==true and device_current == false.
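+                  Analogously, if the device copy has been modified but not yet
+                  copied back then device_current==true and host_current==false,
+                  and if both flags are true then the two copies are identical.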
+        !*/
+    public:
+
+        gpu_data(
+        ) : data_size(0), host_current(true), device_current(false)
+        {
+        }
+
+        // Not copyable
+        gpu_data(const gpu_data&) = delete;
+        gpu_data& operator=(const gpu_data&) = delete;
+
+        // but is movable
+        gpu_data(gpu_data&&) = default;
+        gpu_data& operator=(gpu_data&&) = default;
+
+        void set_size(size_t new_size)
+        {
+            if (new_size == 0)
+            {
+                data_size = 0;
+                host_current = true;
+                device_current = false;
+                data_host.reset();
+                data_device.reset();
+            }
+            else if (new_size != data_size)
+            {
+                data_size = new_size;
+                host_current = true;
+                device_current = false;
+                data_host.reset(new float[new_size]);
+                data_device.reset();
+            }
+        }
+
+        void async_copy_to_device()
+        {
+            // TODO
+        }
+
+        void async_copy_to_host()
+        {
+            // TODO
+        }
+
+        const float* host() const
+        {
+            copy_to_host();
+            return data_host.get();
+        }
+
+        float* host()
+        {
+            copy_to_host();
+            device_current = false;
+            return data_host.get();
+        }
+
+        const float* device() const
+        {
+            copy_to_device();
+            return data_device.get();
+        }
+
+        float* device()
+        {
+            copy_to_device();
+            host_current = false;
+            return data_device.get();
+        }
+
+        size_t size() const { return data_size; }
+
+    private:
+
+        void copy_to_device() const
+        {
+            if (!device_current)
+            {
+                // TODO, cudamemcpy()
+                device_current = true;
+            }
+        }
+
+        void copy_to_host() const
+        {
+            if (!host_current)
+            {
+                // TODO, cudamemcpy()
+                host_current = true;
+            }
+        }
+
+        size_t data_size;
+        mutable bool host_current;
+        mutable bool device_current;
+
+        std::unique_ptr<float[]> data_host;
+        std::unique_ptr<float[]> data_device;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class tensor
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a 4D array of float values, arranged as
+                num_samples() 3D arrays, each with nr() rows, nc() columns, and k()
+                channels.  The underlying floats are managed by a gpu_data object,
+                so they can live on the host, on the device, or on both (see the
+                CONVENTION for gpu_data above).
+        !*/
+
+    public:
+
+        tensor (
+        ) :
+            m_n(0), m_nr(0), m_nc(0), m_k(0)
+        {
+        }
+
+        inline virtual ~tensor() = 0;
+
+        long num_samples() const { return m_n; }
+        long nr() const { return m_nr; }
+        long nc() const { return m_nc; }
+        long k() const { return m_k; }
+        size_t size() const { return data.size(); }
+
+        void async_copy_to_host()
+        {
+            data.async_copy_to_host();
+        }
+
+        void async_copy_to_device()
+        {
+            data.async_copy_to_device();
+        }
+        /*!
+            ensures
+                - begins asynchronously copying this tensor to the GPU.
+
+                  NOTE that the "get device pointer" routine in this class
+                  will have to do some kind of synchronization that ensures
+                  the copy is finished.
+        !*/
+
+        const float* host() const { return data.host(); }
+        float* host() { return data.host(); }
+        const float* device() const { return data.device(); }
+        float* device() { return data.device(); }
+
+        tensor& operator= (float val)
+        {
+            // TODO, do on the device if that's where the memory is living right now.
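+            // For now the assignment happens on the host.  Note that the non-const
+            // data.host() call below also marks any device copy as stale, so a
+            // later device() access will trigger a fresh host-to-device copy.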
+            auto d = data.host();
+            for (size_t i = 0; i < data.size(); ++i)
+                d[i] = val;
+            return *this;
+        }
+
+        template <typename EXP>
+        tensor& operator= (const matrix_exp<EXP>& item)
+        {
+            DLIB_CASSERT(num_samples() == item.nr() &&
+                         nr()*nc()*k() == item.nc(),"");
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+
+            set_ptrm(data.host(), m_n, m_nr*m_nc*m_k) = item;
+            return *this;
+        }
+
+        template <typename EXP>
+        tensor& operator+= (const matrix_exp<EXP>& item)
+        {
+            DLIB_CASSERT(num_samples() == item.nr() &&
+                         nr()*nc()*k() == item.nc(),"");
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(data.host(), m_n, m_nr*m_nc*m_k) += item;
+            return *this;
+        }
+
+        template <typename EXP>
+        tensor& operator-= (const matrix_exp<EXP>& item)
+        {
+            DLIB_CASSERT(num_samples() == item.nr() &&
+                         nr()*nc()*k() == item.nc(),"");
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(data.host(), m_n, m_nr*m_nc*m_k) -= item;
+            return *this;
+        }
+
+        template <typename EXP>
+        void set_sample (
+            unsigned long idx,
+            const matrix_exp<EXP>& item
+        )
+        {
+            DLIB_CASSERT(idx < num_samples(), "");
+            DLIB_CASSERT(item.size() == nr()*nc()*k(), "");
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(data.host()+idx*item.size(), item.nr(), item.nc()) = item;
+        }
+
+
+        template <typename EXP>
+        void add_to_sample (
+            unsigned long idx,
+            const matrix_exp<EXP>& item
+        )
+        {
+            DLIB_CASSERT(idx < num_samples(), "");
+            DLIB_CASSERT(item.size() == nr()*nc()*k(), "");
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(data.host()+idx*item.size(), item.nr(), item.nc()) += item;
+        }
+
+    protected:
+
+        tensor& operator= (const tensor& item)
+        {
+            m_n  = item.m_n;
+            m_nr = item.m_nr;
+            m_nc = item.m_nc;
+            m_k  = item.m_k;
+            data.set_size(item.data.size());
+            std::memcpy(data.host(), item.data.host(), data.size()*sizeof(float));
+            return *this;
+        }
+
+        tensor(
+            const tensor& item
+        )
+        {
+            *this = item;
+        }
+
+        tensor(tensor&& item) = default;
+        tensor& operator=(tensor&& item) = default;
+
+
+        long m_n;
+        long m_nr;
+        long m_nc;
+        long m_k;
+        gpu_data data;
+    };
+
+    tensor::~tensor()
+    {
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline const matrix_op<op_pointer_to_mat<float> > mat (
+        const tensor& t,
+        long nr,
+        long nc
+    )
+    {
+        DLIB_ASSERT(nr > 0 && nc > 0 ,
+            "\tconst matrix_exp mat(tensor, nr, nc)"
+            << "\n\t nr and nc must be bigger than 0"
+            << "\n\t nr: " << nr
+            << "\n\t nc: " << nc
+        );
+        DLIB_ASSERT(nr*nc == t.size() ,
+            "\tconst matrix_exp mat(tensor, nr, nc)"
+            << "\n\t The sizes don't match up."
+            << "\n\t nr*nc:    " << nr*nc
+            << "\n\t t.size(): " << t.size()
+        );
+        typedef op_pointer_to_mat<float> op;
+        return matrix_op<op>(op(t.host(),nr,nc));
+    }
+
+    inline const matrix_op<op_pointer_to_mat<float> > mat (
+        const tensor& t
+    )
+    {
+        DLIB_ASSERT(t.size() != 0,
+            "\tconst matrix_exp mat(tensor)"
+            << "\n\t The tensor can't be empty."
+ ); + + return mat(t, t.num_samples(), t.size()/t.num_samples()); + } + +// ---------------------------------------------------------------------------------------- + + inline bool have_same_dimensions ( + const tensor& a, + const tensor& b + ) + { + return a.num_samples() == b.num_samples() && + a.nr() == b.nr() && + a.nc() == b.nc() && + a.k() == b.k(); + } + +// ---------------------------------------------------------------------------------------- + + class resizable_tensor : public tensor + { + public: + resizable_tensor( + ) + {} + + explicit resizable_tensor( + long n_, long nr_ = 1, long nc_ = 1, long k_ = 1 + ) + { + set_size(n_,nr_,nc_,k_); + } + + resizable_tensor(const resizable_tensor&) = default; + resizable_tensor(resizable_tensor&&) = default; + + void clear( + ) + { + set_size(0,0,0,0); + } + + void copy_size ( + const tensor& item + ) + /*! + ensures + - resizes *this so that: have_same_dimensions(#*this, item)==true + !*/ + { + set_size(item.num_samples(), item.nr(), item.nc(), item.k()); + } + + resizable_tensor& operator= (float val) + { + tensor::operator=(val); + return *this; + } + + template <typename EXP> + resizable_tensor& operator= (const matrix_exp<EXP>& item) + { + tensor::operator=(item); + return *this; + } + + template <typename EXP> + resizable_tensor& operator+= (const matrix_exp<EXP>& item) + { + tensor::operator+=(item); + return *this; + } + + template <typename EXP> + resizable_tensor& operator-= (const matrix_exp<EXP>& item) + { + tensor::operator-=(item); + return *this; + } + + template <typename EXP> + void set_sample ( + unsigned long idx, + const matrix_exp<EXP>& item + ) + { + tensor::set_sample(idx, item); + } + + template <typename EXP> + void add_to_sample ( + unsigned long idx, + const matrix_exp<EXP>& item + ) + { + tensor::add_to_sample(idx, item); + } + + resizable_tensor& operator= (const resizable_tensor&) = default; + resizable_tensor& operator= (resizable_tensor&&) = default; + + resizable_tensor& operator= (const tensor& x) + { + tensor::operator=(x); + return *this; + } + + void set_size( + long n_, long nr_ = 1, long nc_ = 1, long k_ = 1 + ) + { + m_n = n_; + m_nr = nr_; + m_nc = nc_; + m_k = k_; + data.set_size(m_n*m_nr*m_nc*m_k); + } + }; + +// ---------------------------------------------------------------------------------------- + + inline double dot( + const tensor& a, + const tensor& b + ) + { + DLIB_CASSERT(a.size() == b.size(), ""); + const float* da = a.host(); + const float* db = b.host(); + double sum = 0; + for (size_t i = 0; i < a.size(); ++i) + sum += da[i]*db[i]; + return sum; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_TENSOR_H_ +