Commit 3b75b335 authored by Davis King

Gave dnn_trainer the ability to train on out-of-core data by adding the
train_one_step() member function.  Also improved how host-to-device transfers
are overlapped with kernel computation.
parent adec3eef
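
To make the intent of the new API concrete, here is a minimal usage sketch (not part of the commit) of training from an out-of-core data source with train_one_step(). The net_type and the loader callable are placeholders for whatever network and on-disk data source the caller has; only dnn_trainer, train_one_step(), and get_net() are taken from this patch.

    #include <cstddef>
    #include <vector>
    #include <dlib/dnn.h>

    // Hypothetical sketch: stream mini-batches from disk instead of loading the whole
    // dataset into RAM.  "loader" is any callable that refills data/labels with the
    // next mini-batch (e.g. by reading the next chunk of a file).
    template <typename net_type, typename loader_fn>
    const net_type& train_from_disk (
        dlib::dnn_trainer<net_type>& trainer,
        loader_fn loader,
        size_t num_steps
    )
    {
        std::vector<typename net_type::input_type> data;
        std::vector<typename net_type::label_type> labels;
        for (size_t step = 0; step < num_steps; ++step)
        {
            loader(data, labels);
            // train_one_step() converts the batch to a tensor and enqueues it for the
            // trainer's worker thread, so the host-to-device copy of the next batch can
            // overlap with the gradient update of the previous one.
            trainer.train_one_step(data, labels);
        }
        // get_net() blocks until the worker thread has finished all queued updates.
        return trainer.get_net();
    }
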
@@ -1284,8 +1284,9 @@ namespace dlib
         // "no label". So here we make the constructor private with the exception that
         // add_loss_layer objects can make it (again, just to simplify add_loss_layer's
         // implementation).
-        no_label_type()=default;
+        no_label_type(){};
         template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
+        template < typename net_type, typename solver_type > friend class dnn_trainer;
     };

 // ----------------------------------------------------------------------------------------
...
@@ -10,6 +10,23 @@ namespace dlib
     namespace cuda
     {
+    // -----------------------------------------------------------------------------------
+
+        void set_device (
+            int dev
+        )
+        {
+            CHECK_CUDA(cudaSetDevice(dev));
+        }
+
+        int get_device (
+        )
+        {
+            int dev = 0;
+            CHECK_CUDA(cudaGetDevice(&dev));
+            return dev;
+        }
+
     // -----------------------------------------------------------------------------------

         __global__ void _cuda_multiply(float* d, const float* s, size_t n)
...
@@ -3,7 +3,6 @@
 #ifndef DLIB_DNN_CuDA_H_
 #define DLIB_DNN_CuDA_H_

-#ifdef DLIB_USE_CUDA

 #include "tensor.h"
@@ -12,6 +11,17 @@ namespace dlib
     namespace cuda
     {
+#ifdef DLIB_USE_CUDA
+
+    // ----------------------------------------------------------------------------------------
+
+        void set_device (
+            int dev
+        );
+
+        int get_device (
+        );
+
     // -----------------------------------------------------------------------------------

         void multiply (
@@ -120,11 +130,24 @@
         );

     // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+
+#else // if DLIB_USE_CUDA NOT DEFINED
+
+        inline void set_device (
+            int dev
+        ){}
+
+        inline int get_device (
+        ){}
+
+#endif // DLIB_USE_CUDA
     }
 }

-#endif // DLIB_USE_CUDA

 #endif // DLIB_DNN_CuDA_H_
...
@@ -11,6 +11,10 @@
 #include <chrono>
 #include "../serialize.h"

+#include "../pipe.h"
+#include "../threads.h"
+#include "cuda_dlib.h"
+
 namespace dlib
 {
@@ -20,7 +24,7 @@ namespace dlib
         typename net_type,
         typename solver_type = sgd
         >
-    class dnn_trainer
+    class dnn_trainer : private threaded_object
     {
     public:
@@ -31,12 +35,12 @@ namespace dlib
         typedef typename net_type::input_type input_type;

         dnn_trainer(
-        )
+        ) : job_pipe(0)
         {
             init();
         }

-        explicit dnn_trainer(const net_type& net_) : net(net_)
+        explicit dnn_trainer(const net_type& net_) : job_pipe(0), net(net_)
         {
             init();
         }
@@ -44,18 +48,31 @@ namespace dlib
         dnn_trainer(
             const net_type& net_,
             const solver_type& solver_
-        ) : net(net_), solvers(solver_)
+        ) : job_pipe(0), net(net_), solvers(solver_)
         {
             init();
         }

+        ~dnn_trainer(
+        )
+        {
+            job_pipe.disable();
+            stop();
+            wait();
+        }
+
         const net_type& get_net (
-        ) const { return net; }
+        ) const
+        {
+            wait_for_thread_to_pause();
+            return net;
+        }

         void set_net (
             const net_type& net_
         )
         {
+            wait_for_thread_to_pause();
             return net = net_;
         }
@@ -63,6 +80,7 @@ namespace dlib
             const solver_type& solver_
         )
         {
+            wait_for_thread_to_pause();
             solvers = solver_;
         }
@@ -102,69 +120,61 @@ namespace dlib
         const sstack<solver_type,net_type::num_layers>& get_solvers (
-        ) const { return solvers; }
+        ) const
+        {
+            wait_for_thread_to_pause();
+            return solvers;
+        }

         sstack<solver_type,net_type::num_layers>& get_solvers (
-        ) { return solvers; }
-
-        const net_type& train (
+        )
+        {
+            wait_for_thread_to_pause();
+            return solvers;
+        }
+
+        void train_one_step (
             const std::vector<input_type>& data,
             const std::vector<label_type>& labels
         )
         {
-            DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");
-
-            resizable_tensor t1, t2;
+            job.labels = labels;
+            net.to_tensor(data.begin(), data.end(), job.t);
+            job_pipe.enqueue(job);
+        }
+
+        void train_one_step (
+            const std::vector<input_type>& data
+        )
+        {
+            net.to_tensor(data.begin(), data.end(), job.t);
+            job_pipe.enqueue(job);
+        }
+
+        const net_type& train (
+            const std::vector<input_type>& data,
+            const std::vector<label_type>& labels
+        )
+        {
+            DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");

             console_progress_indicator pbar(num_epochs);
             pbar.print_status(0);
             for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
             {
-                running_stats<double> rs;
-                size_t j = 0;
-
-                // Load two tensors worth of data at once so we can overlap the computation
-                // and data transfer between the host and the device.
-                if (j < data.size())
-                {
-                    net.to_tensor(data.begin()+j,
-                                  data.begin()+std::min(j+mini_batch_size,data.size()), t1);
-                    j += mini_batch_size;
-                }
-                if (j < data.size())
-                {
-                    net.to_tensor(data.begin()+j,
-                                  data.begin()+std::min(j+mini_batch_size,data.size()), t2);
-                    j += mini_batch_size;
-                }
-
-                size_t i = 0;
                 using namespace std::chrono;
                 auto last_time = system_clock::now();
-                while (i < data.size())
-                {
-                    rs.add(net.update(t1, labels.begin()+i, solvers));
-                    i += mini_batch_size;
-                    if (j < data.size())
-                    {
-                        net.to_tensor(data.begin()+j,
-                                      data.begin()+std::min(j+mini_batch_size,data.size()), t1);
-                        j += mini_batch_size;
-                    }
-                    if (i < data.size())
-                    {
-                        rs.add(net.update(t2, labels.begin()+i, solvers));
-                        i += mini_batch_size;
-                        if (j < data.size())
-                        {
-                            net.to_tensor(data.begin()+j,
-                                          data.begin()+std::min(j+mini_batch_size,data.size()), t2);
-                            j += mini_batch_size;
-                        }
-                    }
+                clear_average_loss();
+                for (size_t i = 0; i < data.size(); i += mini_batch_size)
+                {
+                    net.to_tensor(data.begin()+i,
+                                  data.begin()+std::min(i+mini_batch_size,data.size()),
+                                  job.t);
+                    job.labels.assign(labels.begin()+i,
+                                      labels.begin()+std::min(i+mini_batch_size,data.size()));
+                    job_pipe.enqueue(job);
+
                     if (verbose)
                     {
@@ -174,7 +184,7 @@ namespace dlib
                         last_time = now_time;
                         auto iter = epoch_iteration + i/(double)data.size();
                         std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
-                                  << "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
+                                  << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
                         pbar.print_status(iter, true);
                         std::cout << std::endl;
                     }
@@ -186,12 +196,12 @@ namespace dlib
                     // Capitalize the E in Epoch so it's easy to grep out the lines that
                     // are for full epoch status statements.
                     std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
-                              << "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
+                              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
                     pbar.print_status(epoch_iteration+1, true);
                     std::cout << std::endl;
                 }
             }
-            return net;
+            return get_net();
         }

         const net_type& train (
@@ -204,55 +214,20 @@ namespace dlib
             static_assert(has_unsupervised_loss,
                 "You can only call this version of train() when using an unsupervised loss.");

-            resizable_tensor t1, t2;
-
             console_progress_indicator pbar(num_epochs);
             pbar.print_status(0);
             for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
             {
-                running_stats<double> rs;
-                size_t j = 0;
-
-                // Load two tensors worth of data at once so we can overlap the computation
-                // and data transfer between the host and the device.
-                if (j < data.size())
-                {
-                    net.to_tensor(data.begin()+j,
-                                  data.begin()+std::min(j+mini_batch_size,data.size()), t1);
-                    j += mini_batch_size;
-                }
-                if (j < data.size())
-                {
-                    net.to_tensor(data.begin()+j,
-                                  data.begin()+std::min(j+mini_batch_size,data.size()), t2);
-                    j += mini_batch_size;
-                }
-
-                size_t i = 0;
                 using namespace std::chrono;
                 auto last_time = system_clock::now();
-                while (i < data.size())
-                {
-                    rs.add(net.update(t1, solvers));
-                    i += mini_batch_size;
-                    if (j < data.size())
-                    {
-                        net.to_tensor(data.begin()+j,
-                                      data.begin()+std::min(j+mini_batch_size,data.size()), t1);
-                        j += mini_batch_size;
-                    }
-                    if (i < data.size())
-                    {
-                        rs.add(net.update(t2, solvers));
-                        i += mini_batch_size;
-                        if (j < data.size())
-                        {
-                            net.to_tensor(data.begin()+j,
-                                          data.begin()+std::min(j+mini_batch_size,data.size()), t2);
-                            j += mini_batch_size;
-                        }
-                    }
+                clear_average_loss();
+                for (size_t i = 0; i < data.size(); i += mini_batch_size)
+                {
+                    net.to_tensor(data.begin()+i,
+                                  data.begin()+std::min(i+mini_batch_size,data.size()),
+                                  job.t);
+                    job_pipe.enqueue(job);
+
                     if (verbose)
                     {
@@ -262,7 +237,7 @@
                         last_time = now_time;
                         auto iter = epoch_iteration + i/(double)data.size();
                         std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
-                                  << "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
+                                  << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
                         pbar.print_status(iter, true);
                         std::cout << std::endl;
                     }
@@ -274,18 +249,20 @@
                     // Capitalize the E in Epoch so it's easy to grep out the lines that
                     // are for full epoch status statements.
                     std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
-                              << "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
+                              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
                     pbar.print_status(epoch_iteration+1, true);
                     std::cout << std::endl;
                 }
             }
-            return net;
+            return get_net();
         }

         friend void serialize(const dnn_trainer& item, std::ostream& out)
         {
+            item.wait_for_thread_to_pause();
             int version = 1;
             serialize(version, out);
+            serialize(item.rs, out);
             serialize(item.num_epochs, out);
             serialize(item.mini_batch_size, out);
             serialize(item.verbose, out);
@@ -295,10 +272,12 @@
         friend void deserialize(dnn_trainer& item, std::istream& in)
         {
+            item.wait_for_thread_to_pause();
             int version = 0;
             deserialize(version, in);
             if (version != 1)
                 throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
+            deserialize(item.rs, in);
             deserialize(item.num_epochs, in);
             deserialize(item.mini_batch_size, in);
             deserialize(item.verbose, in);
@@ -306,7 +285,58 @@
             deserialize(item.solvers, in);
         }

+        double get_average_loss (
+        ) const
+        {
+            wait_for_thread_to_pause();
+            return rs.mean();
+        }
+
+        void clear_average_loss (
+        )
+        {
+            wait_for_thread_to_pause();
+            rs.clear();
+        }
+
     private:

+        struct job_t
+        {
+            std::vector<label_type> labels;
+            resizable_tensor t;
+        };
+
+        template <typename T>
+        void run_update(job_t& next_job, const T&)
+        {
+            rs.add(net.update(next_job.t, next_job.labels.begin(), solvers));
+        }
+
+        void run_update(job_t& next_job, const no_label_type&)
+        {
+            no_label_type pick_wich_run_update;
+            rs.add(net.update(next_job.t, solvers));
+        }
+
+        void thread()
+        {
+            // Make sure this thread uses the same cuda device as the thread that created
+            // the dnn_trainer object.
+            dlib::cuda::set_device(cuda_device_id);
+            label_type pick_wich_run_update;
+            job_t next_job;
+            while(job_pipe.dequeue(next_job))
+            {
+                // call net.update() but pick the right version for unsupervised or
+                // supervised training based on the type of label_type.
+                run_update(next_job, pick_wich_run_update);
+            }
+        }
+
+        void wait_for_thread_to_pause() const
+        {
+            job_pipe.wait_for_num_blocked_dequeues(1);
+        }
+
         const static long string_pad = 10;
@@ -315,11 +345,20 @@
             num_epochs = 300;
             mini_batch_size = 11;
             verbose = false;
+            cuda_device_id = dlib::cuda::get_device();
+            start();
         }

+        // The job object is not logically part of the state of this object. It is here
+        // only to avoid reallocating it over and over.
+        job_t job;
+
+        dlib::pipe<job_t> job_pipe;
+        running_stats<double> rs;
         unsigned long num_epochs;
         size_t mini_batch_size;
         bool verbose;
+        int cuda_device_id;
         net_type net;
         sstack<solver_type,net_type::num_layers> solvers;
...
@@ -192,6 +192,8 @@ namespace dlib
                 and having it execute 10 epochs of training during each call. This also
                 means you can serialize a trainer to disk and then, at a later date,
                 deserialize it and resume training your network where you left off.
+                - You can obtain the average loss value during the final training epoch by
+                  calling get_average_loss().
        !*/

        const net_type& train (
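
The resume-later workflow described above relies on the serialize()/deserialize() friends added to dnn_trainer earlier in this commit. A brief hedged sketch of such checkpointing (the file name and net_type are placeholders, not from the patch):

    #include <fstream>
    #include <dlib/dnn.h>

    // Hypothetical checkpointing sketch built on the serialize()/deserialize() friends
    // added to dnn_trainer in this commit.  "trainer_state.dat" is an arbitrary name.
    template <typename net_type>
    void save_checkpoint (const dlib::dnn_trainer<net_type>& trainer)
    {
        std::ofstream fout("trainer_state.dat", std::ios::binary);
        serialize(trainer, fout);   // found via ADL; pauses the worker thread first
    }

    template <typename net_type>
    void load_checkpoint (dlib::dnn_trainer<net_type>& trainer)
    {
        std::ifstream fin("trainer_state.dat", std::ios::binary);
        deserialize(trainer, fin);  // restores the state written by save_checkpoint()
    }
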
@@ -218,6 +220,67 @@
                 and having it execute 10 epochs of training during each call. This also
                 means you can serialize a trainer to disk and then, at a later date,
                 deserialize it and resume training your network where you left off.
+                - You can obtain the average loss value during the final training epoch by
+                  calling get_average_loss().
+        !*/
+
+        void train_one_step (
+            const std::vector<input_type>& data,
+            const std::vector<label_type>& labels
+        );
+        /*!
+            requires
+                - data.size() == labels.size()
+                - net_type uses a supervised loss.
+                  i.e. net_type::label_type != no_label_type.
+            ensures
+                - Performs one stochastic gradient update step based on the mini-batch of
+                  data and labels supplied to this function. In particular, calling
+                  train_one_step() in a loop is equivalent to calling the train() method
+                  defined above. However, train_one_step() allows you to stream data from
+                  disk into the training process while train() requires you to first load
+                  all the training data into RAM. Otherwise, these training methods are
+                  equivalent.
+                - You can observe the current average loss value by calling get_average_loss().
+        !*/
+
+        void train_one_step (
+            const std::vector<input_type>& data
+        );
+        /*!
+            requires
+                - net_type uses an unsupervised loss.
+                  i.e. net_type::label_type == no_label_type.
+            ensures
+                - Performs one stochastic gradient update step based on the mini-batch of
+                  data supplied to this function. In particular, calling train_one_step()
+                  in a loop is equivalent to calling the train() method defined above.
+                  However, train_one_step() allows you to stream data from disk into the
+                  training process while train() requires you to first load all the
+                  training data into RAM. Otherwise, these training methods are
+                  equivalent.
+                - You can observe the current average loss value by calling get_average_loss().
+        !*/
+
+        double get_average_loss (
+        ) const;
+        /*!
+            ensures
+                - returns the average loss value observed during previous calls to
+                  train_one_step() or train(). That is, the average output of
+                  net_type::update() during the previous mini-batch updates.
+        !*/
+
+        void clear_average_loss (
+        );
+        /*!
+            ensures
+                - #get_average_loss() == 0
+                - get_average_loss() uses a dlib::running_stats object to keep a running
+                  average of the loss values seen during the previous mini-batch updates
+                  applied during training. Calling clear_average_loss() resets the
+                  running_stats object so it forgets about all previous loss values
+                  observed.
        !*/
    };
...
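
As a usage note (not from the patch), the loss accessors documented above combine with train_one_step() roughly as follows; the batch source is a placeholder callable that returns false once a pass over the data is finished.

    #include <iostream>
    #include <vector>
    #include <dlib/dnn.h>

    // Illustrative sketch: report the average loss once per pass over a data source.
    template <typename net_type, typename batch_source>
    void run_one_epoch (dlib::dnn_trainer<net_type>& trainer, batch_source next_batch)
    {
        std::vector<typename net_type::input_type> data;
        std::vector<typename net_type::label_type> labels;

        // Forget losses from earlier epochs so the number reported below reflects
        // only this pass over the data.
        trainer.clear_average_loss();

        while (next_batch(data, labels))
            trainer.train_one_step(data, labels);

        // get_average_loss() waits for the trainer's worker thread to finish the queued
        // mini-batches, then returns the mean loss seen since the last clear.
        std::cout << "epoch average loss: " << trainer.get_average_loss() << std::endl;
    }
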