Commit 3b75b335 authored by Davis King

Gave dnn_trainer the ability to train on out-of-core data by adding the
train_one_step() member function.  Also improved how host-to-device transfers
are overlapped with kernel computation.
parent adec3eef
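For illustration, a rough sketch of the intended out-of-core workflow (the network type and the data-loading helper below are placeholders, not part of this commit):

    // Convert and enqueue one mini-batch per call; the trainer's worker thread
    // runs the actual update.  While that update executes on the device, the
    // caller is free to load and convert the next mini-batch on the host, which
    // is where the transfer/compute overlap comes from.
    dnn_trainer<my_net_type> trainer(net);
    while (load_next_mini_batch_from_disk(samples, labels))   // placeholder helper
        trainer.train_one_step(samples, labels);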
......@@ -1284,8 +1284,9 @@ namespace dlib
// "no label". So here we make the constructor private with the exception that
// add_loss_layer objects can make it (again, just to simplify add_loss_layer's
// implementation).
no_label_type()=default;
no_label_type(){};
template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
template < typename net_type, typename solver_type > friend class dnn_trainer;
};
// ----------------------------------------------------------------------------------------
......
......@@ -10,6 +10,23 @@ namespace dlib
namespace cuda
{
// -----------------------------------------------------------------------------------
void set_device (
int dev
)
{
CHECK_CUDA(cudaSetDevice(dev));
}
int get_device (
)
{
int dev = 0;
CHECK_CUDA(cudaGetDevice(&dev));
return dev;
}
// -----------------------------------------------------------------------------------
__global__ void _cuda_multiply(float* d, const float* s, size_t n)
......
......@@ -3,7 +3,6 @@
#ifndef DLIB_DNN_CuDA_H_
#define DLIB_DNN_CuDA_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
......@@ -12,6 +11,17 @@ namespace dlib
namespace cuda
{
#ifdef DLIB_USE_CUDA
// ----------------------------------------------------------------------------------------
void set_device (
int dev
);
int get_device (
);
// -----------------------------------------------------------------------------------
void multiply (
......@@ -120,11 +130,24 @@ namespace dlib
);
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
#else // if DLIB_USE_CUDA NOT DEFINED
inline void set_device (
int dev
){}
inline int get_device (
){ return 0; }
#endif // DLIB_USE_CUDA
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_H_
......@@ -11,6 +11,10 @@
#include <chrono>
#include "../serialize.h"
#include "../pipe.h"
#include "../threads.h"
#include "cuda_dlib.h"
namespace dlib
{
......@@ -20,7 +24,7 @@ namespace dlib
typename net_type,
typename solver_type = sgd
>
class dnn_trainer
class dnn_trainer : private threaded_object
{
public:
......@@ -31,12 +35,12 @@ namespace dlib
typedef typename net_type::input_type input_type;
dnn_trainer(
)
) : job_pipe(0)
{
init();
}
explicit dnn_trainer(const net_type& net_) : net(net_)
explicit dnn_trainer(const net_type& net_) : job_pipe(0), net(net_)
{
init();
}
......@@ -44,18 +48,31 @@ namespace dlib
dnn_trainer(
const net_type& net_,
const solver_type& solver_
) : net(net_), solvers(solver_)
) : job_pipe(0), net(net_), solvers(solver_)
{
init();
}
~dnn_trainer(
)
{
job_pipe.disable();
stop();
wait();
}
const net_type& get_net (
) const { return net; }
) const
{
wait_for_thread_to_pause();
return net;
}
void set_net (
const net_type& net_
)
{
wait_for_thread_to_pause();
net = net_;
}
......@@ -63,6 +80,7 @@ namespace dlib
const solver_type& solver_
)
{
wait_for_thread_to_pause();
solvers = solver_;
}
......@@ -102,10 +120,37 @@ namespace dlib
const sstack<solver_type,net_type::num_layers>& get_solvers (
) const { return solvers; }
) const
{
wait_for_thread_to_pause();
return solvers;
}
sstack<solver_type,net_type::num_layers>& get_solvers (
) { return solvers; }
)
{
wait_for_thread_to_pause();
return solvers;
}
void train_one_step (
const std::vector<input_type>& data,
const std::vector<label_type>& labels
)
{
job.labels = labels;
net.to_tensor(data.begin(), data.end(), job.t);
job_pipe.enqueue(job);
}
void train_one_step (
const std::vector<input_type>& data
)
{
net.to_tensor(data.begin(), data.end(), job.t);
job_pipe.enqueue(job);
}
const net_type& train (
const std::vector<input_type>& data,
......@@ -114,57 +159,22 @@ namespace dlib
{
DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");
resizable_tensor t1, t2;
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
{
running_stats<double> rs;
size_t j = 0;
// Load two tensors worth of data at once so we can overlap the computation
// and data transfer between the host and the device.
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
size_t i = 0;
using namespace std::chrono;
auto last_time = system_clock::now();
while (i < data.size())
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
{
rs.add(net.update(t1, labels.begin()+i, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
job.t);
job.labels.assign(labels.begin()+i,
labels.begin()+std::min(i+mini_batch_size,data.size()));
job_pipe.enqueue(job);
if (i < data.size())
{
rs.add(net.update(t2, labels.begin()+i, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
}
if (verbose)
{
......@@ -174,7 +184,7 @@ namespace dlib
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
}
......@@ -186,12 +196,12 @@ namespace dlib
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
}
}
return net;
return get_net();
}
const net_type& train (
......@@ -204,55 +214,20 @@ namespace dlib
static_assert(has_unsupervised_loss,
"You can only call this version of train() when using an unsupervised loss.");
resizable_tensor t1, t2;
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
{
running_stats<double> rs;
size_t j = 0;
// Load two tensors worth of data at once so we can overlap the computation
// and data transfer between the host and the device.
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
size_t i = 0;
using namespace std::chrono;
auto last_time = system_clock::now();
while (i < data.size())
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
{
rs.add(net.update(t1, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
job.t);
job_pipe.enqueue(job);
if (i < data.size())
{
rs.add(net.update(t2, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
}
if (verbose)
{
......@@ -262,7 +237,7 @@ namespace dlib
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
}
......@@ -274,18 +249,20 @@ namespace dlib
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
}
}
return net;
return get_net();
}
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 1;
serialize(version, out);
serialize(item.rs, out);
serialize(item.num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
......@@ -295,10 +272,12 @@ namespace dlib
friend void deserialize(dnn_trainer& item, std::istream& in)
{
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
deserialize(item.rs, in);
deserialize(item.num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
......@@ -306,7 +285,58 @@ namespace dlib
deserialize(item.solvers, in);
}
double get_average_loss (
) const
{
wait_for_thread_to_pause();
return rs.mean();
}
void clear_average_loss (
)
{
wait_for_thread_to_pause();
rs.clear();
}
private:
struct job_t
{
std::vector<label_type> labels;
resizable_tensor t;
};
template <typename T>
void run_update(job_t& next_job, const T&)
{
rs.add(net.update(next_job.t, next_job.labels.begin(), solvers));
}
void run_update(job_t& next_job, const no_label_type&)
{
no_label_type pick_which_run_update;
rs.add(net.update(next_job.t, solvers));
}
void thread()
{
// Make sure this thread uses the same cuda device as the thread that created
// the dnn_trainer object.
dlib::cuda::set_device(cuda_device_id);
label_type pick_which_run_update;
job_t next_job;
while(job_pipe.dequeue(next_job))
{
// call net.update() but pick the right version for unsupervised or
// supervised training based on the type of label_type.
run_update(next_job, pick_which_run_update);
}
}
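// Note: job_pipe was constructed with a maximum size of 0, so the worker
// thread can only be blocked in dequeue() once it has finished every job
// handed to it so far.  Waiting for one blocked dequeue therefore means the
// thread is idle and it is safe to touch net, solvers, and rs.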
void wait_for_thread_to_pause() const
{
job_pipe.wait_for_num_blocked_dequeues(1);
}
const static long string_pad = 10;
......@@ -315,11 +345,20 @@ namespace dlib
num_epochs = 300;
mini_batch_size = 11;
verbose = false;
cuda_device_id = dlib::cuda::get_device();
start();
}
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
job_t job;
dlib::pipe<job_t> job_pipe;
running_stats<double> rs;
unsigned long num_epochs;
size_t mini_batch_size;
bool verbose;
int cuda_device_id;
net_type net;
sstack<solver_type,net_type::num_layers> solvers;
......
......@@ -192,6 +192,8 @@ namespace dlib
and having it execute 10 epochs of training during each call. This also
means you can serialize a trainer to disk and then, at a later date,
deserialize it and resume training your network where you left off.
- You can obtain the average loss value during the final training epoch by
calling get_average_loss().
!*/
const net_type& train (
......@@ -218,6 +220,67 @@ namespace dlib
and having it execute 10 epochs of training during each call. This also
means you can serialize a trainer to disk and then, at a later date,
deserialize it and resume training your network where you left off.
- You can obtain the average loss value during the final training epoch by
calling get_average_loss().
!*/
void train_one_step (
const std::vector<input_type>& data,
const std::vector<label_type>& labels
);
/*!
requires
- data.size() == labels.size()
- net_type uses a supervised loss.
i.e. net_type::label_type != no_label_type.
ensures
- Performs one stochastic gradient update step based on the mini-batch of
data and labels supplied to this function. In particular, calling
train_one_step() in a loop is equivalent to calling the train() method
defined above. However, train_one_step() allows you to stream data from
disk into the training process while train() requires you to first load
all the training data into RAM. Otherwise, these training methods are
equivalent.
- You can observe the current average loss value by calling get_average_loss().
!*/
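// A minimal usage sketch (not part of the library): stream mini-batches from
// disk with a hypothetical reader and feed them to the trainer.  The names
// my_net_type, mini_batch_reader, and read_next() are illustrative only.
//
//     my_net_type net;
//     dnn_trainer<my_net_type> trainer(net);
//     mini_batch_reader reader("train.dat");
//     std::vector<input_type> samples;
//     std::vector<label_type> labels;
//     while (reader.read_next(samples, labels))
//         trainer.train_one_step(samples, labels);
//     std::cout << "final average loss: " << trainer.get_average_loss() << std::endl;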
void train_one_step (
const std::vector<input_type>& data
);
/*!
requires
- net_type uses an unsupervised loss.
i.e. net_type::label_type == no_label_type.
ensures
- Performs one stochastic gradient update step based on the mini-batch of
data supplied to this function. In particular, calling train_one_step()
in a loop is equivalent to calling the train() method defined above.
However, train_one_step() allows you to stream data from disk into the
training process while train() requires you to first load all the
training data into RAM. Otherwise, these training methods are
equivalent.
- You can observe the current average loss value by calling get_average_loss().
!*/
double get_average_loss (
) const;
/*!
ensures
- returns the average loss value observed during previous calls to
train_one_step() or train(). That is, the average output of
net_type::update() during the previous mini-batch updates.
!*/
void clear_average_loss (
);
/*!
ensures
- #get_average_loss() == 0
- get_average_loss() uses a dlib::running_stats object to keep a running
average of the loss values seen during the previous mini-batch updates
applied during training. Calling clear_average_loss() resets the
running_stats object so it forgets about all previous loss values
observed.
!*/
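// Usage sketch (illustrative only): report and reset the running loss every
// so often while streaming data with train_one_step().
//
//     for (unsigned long i = 0; reader.read_next(samples, labels); ++i)
//     {
//         trainer.train_one_step(samples, labels);
//         if (i%100 == 99)
//         {
//             std::cout << "average loss: " << trainer.get_average_loss() << std::endl;
//             trainer.clear_average_loss();
//         }
//     }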
};
......