Commit 9260243c authored by Davis King

Fixed bugs in multi-gpu training code.

parent 26d2d889
@@ -501,28 +501,12 @@ namespace dlib
         std::vector<std::future<double>> losses(devices.size());
         std::vector<std::future<void>> update_futs(devices.size());

-        std::vector<tt::multi_device_tensor_averager> averagers(net_type::num_computational_layers);
-        if (devices.size() > 1)
-        {
-            // setup the averagers to point to the tensors in the networks.
-            std::vector<std::vector<tensor*>> all_tensors(devices.size());
-            for (size_t i = 0; i < all_tensors.size(); ++i)
-            {
-                all_tensors[i].resize(net_type::num_computational_layers);
-                visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){
-                    all_tensors[i][j] = &t;
-                });
-            }
-            // Now set each averager to average the tensors at the same layer in each
-            // network.
-            for (size_t i = 0; i < net_type::num_computational_layers; ++i)
-            {
-                std::vector<tensor*> temp(all_tensors.size());
-                for (size_t j = 0; j < all_tensors.size(); ++j)
-                    temp[j] = all_tensors[j][i];
-                averagers[i].set(temp);
-            }
-        }
+        std::vector<tt::multi_device_tensor_averager> averagers;
+        // An array of all the parameter tensors in the first network.  We will
+        // periodically copy these tensors to all the other devices to make sure the
+        // different GPUs don't go out of sync.
+        std::vector<tensor*> reference_params;
+        visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });

         size_t iteration = 0;
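In the new version the averager list starts out empty and a list of raw pointers to the first network's parameter tensors (reference_params) is captured up front via visit_layer_parameters. The snippet below is a minimal, self-contained sketch of that pointer-collection idiom; Net, Tensor, and for_each_parameter are hypothetical stand-ins for dlib's net type, tensor, and visit_layer_parameters, not the library's actual API.

```cpp
// Sketch of "collect pointers to the reference replica's parameters".
// Net, Tensor, and for_each_parameter are hypothetical stand-ins.
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

struct Tensor { std::vector<float> data; };

struct Net
{
    std::vector<Tensor> params;
    // Calls f(layer_index, tensor) for every parameter tensor, mirroring the
    // shape of dlib's visit_layer_parameters callback.
    void for_each_parameter(const std::function<void(std::size_t, Tensor&)>& f)
    {
        for (std::size_t i = 0; i < params.size(); ++i)
            f(i, params[i]);
    }
};

int main()
{
    Net reference_net{{Tensor{{1, 2}}, Tensor{{3, 4, 5}}}};

    // Same idea as: visit_layer_parameters(devices[0]->net, ...push_back(&t)...)
    std::vector<Tensor*> reference_params;
    reference_net.for_each_parameter([&](std::size_t, Tensor& t) { reference_params.push_back(&t); });

    std::cout << "collected " << reference_params.size() << " parameter tensors\n";
}
```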
@@ -544,48 +528,41 @@ namespace dlib
             // gradient updates between devices.  So we do that now.
             if (devices.size() > 1)
             {
-                for (auto&& d : devices)
-                    cuda::device_synchronize(d->device_id);
-
-                for (auto&& avg : averagers)
-                    avg.average();
-                /*
-                for (auto&& d : devices)
-                    cuda::device_synchronize(d->device_id);
-                */
-
-                // Evey now and then force all the parameters to be the same just to
-                // make sure they aren't drifting apart due to any non-deterministic
-                // behavior on the GPU.
-                /*
-                if (iteration%5000 == 1)
-                {
-                    for (auto&& p : param_buffer)
-                        p = 0;
-                    // now average all the parameters
-                    for (size_t i = 0; i < devices.size(); ++i)
-                    {
-                        visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
-                            if (t.size() != 0)
-                                param_buffer[j] += mat(t);
-                        });
-                    }
-                    // and then assign the parameters back to all the networks.
-                    const float scale = 1.0f/devices.size();
-                    for (size_t i = 0; i < devices.size(); ++i)
-                    {
-                        visit_layer_parameters(devices[i]->net, [scale,&param_buffer](size_t j, tensor& t) {
-                            if (t.size() != 0)
-                            {
-                                t = param_buffer[j]*scale;
-                                t.async_copy_to_device();
-                            }
-                        });
-                    }
-                }
-                */
+                // if this is the first iteration then we need to setup the averagers.
+                // We can't do this outside the loop because the tensors that get
+                // averaged need to be allocated to their devices before we call set()
+                // so that the averagers can determine how best to average them.
+                if (averagers.size() == 0)
+                {
+                    averagers = std::vector<tt::multi_device_tensor_averager>(net_type::num_computational_layers);
+                    // setup the averagers to point to the tensors in the networks.
+                    std::vector<std::vector<tensor*>> all_tensors(devices.size());
+                    for (size_t i = 0; i < all_tensors.size(); ++i)
+                    {
+                        all_tensors[i].resize(net_type::num_computational_layers);
+                        visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){
+                            all_tensors[i][j] = &t;
+                        });
+                    }
+                    // Now set each averager to average the tensors at the same layer in each
+                    // network.
+                    for (size_t i = 0; i < net_type::num_computational_layers; ++i)
+                    {
+                        std::vector<tensor*> temp(all_tensors.size());
+                        for (size_t j = 0; j < all_tensors.size(); ++j)
+                            temp[j] = all_tensors[j][i];
+                        // ignore layers that don't have parameters
+                        if (temp[0]->size() != 0)
+                            averagers[i].set(temp);
+                    }
+                }
+
+                for (auto&& d : devices)
+                    cuda::device_synchronize(d->device_id);
+
+                for (auto&& avg : averagers)
+                    avg.average();
             }
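The fix in this hunk is to build the averagers lazily on the first trip through the training loop, after every replica's tensors have been allocated on their devices, and to skip layers that have no parameters. Once set up, the averagers replace each device's per-layer gradient with the mean across devices on every iteration. Below is a rough CPU-only sketch of that averaging step, with plain std::vector<float> standing in for the device tensors; the real code performs this on the GPUs through tt::multi_device_tensor_averager.

```cpp
// CPU-only sketch of synchronous gradient averaging across device replicas.
// std::vector<float> stands in for per-device gradient tensors.
#include <cstddef>
#include <iostream>
#include <vector>

using Grad = std::vector<float>;

// Average one layer's gradients across all devices, writing the mean back to
// every device's copy.
void average_across_devices(std::vector<Grad*>& same_layer)
{
    if (same_layer.empty() || same_layer[0]->empty())
        return;  // layer has no parameters, nothing to average

    const std::size_t n = same_layer[0]->size();
    for (std::size_t i = 0; i < n; ++i)
    {
        float sum = 0;
        for (auto* g : same_layer)
            sum += (*g)[i];
        const float mean = sum / same_layer.size();
        for (auto* g : same_layer)
            (*g)[i] = mean;
    }
}

int main()
{
    Grad dev0 = {1, 3}, dev1 = {3, 5};            // one layer, two devices
    std::vector<Grad*> layer = {&dev0, &dev1};
    average_across_devices(layer);
    std::cout << dev0[0] << " " << dev0[1] << "\n";  // prints: 2 4
}
```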
@@ -597,6 +574,23 @@ namespace dlib
                 f.wait();

+            // Every now and then force all the parameters to be the same just to make
+            // sure they aren't drifting apart due to any non-deterministic behavior on
+            // the GPU.  It's also important to do this on the first iteration because
+            // the different networks may be initialized differently when tensor data
+            // is first passed through them.  So this code block deals with these
+            // issues.
+            if (devices.size() > 1 && iteration%2000 == 1)
+            {
+                for (size_t i = 1; i < devices.size(); ++i)
+                {
+                    visit_layer_parameters(devices[i]->net, [&](size_t j, tensor& t)
+                    {
+                        memcpy(t, *reference_params[j]);
+                    });
+                }
+            }
+
             // If we have been running for a while then check if the loss is still
             // dropping.  If it isn't then we will reduce the step size.  Note that we
             // have a "budget" that prevents us from calling
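The added block forces the replicas back into exact agreement every 2000 iterations (and on the very first one, since iteration%2000 == 1 holds at iteration 1) by copying the first network's parameters over all the others. The sketch below mirrors that cadence with ordinary vectors in place of tensors and plain assignment in place of dlib's tensor memcpy(); it is an illustration of the pattern, not the trainer's actual code.

```cpp
// Sketch of periodic parameter re-sync from a reference replica.  Vectors stand
// in for tensors; assignment stands in for the tensor-to-tensor copy.
#include <cstddef>
#include <iostream>
#include <vector>

using Params = std::vector<std::vector<float>>;  // one vector per layer

// Overwrite every non-reference replica's parameters with the reference copy.
void sync_from_reference(const Params& reference, std::vector<Params>& replicas)
{
    for (std::size_t d = 1; d < replicas.size(); ++d)          // device 0 is the reference
        for (std::size_t layer = 0; layer < reference.size(); ++layer)
            replicas[d][layer] = reference[layer];
}

int main()
{
    std::vector<Params> replicas = {
        {{1.0f, 2.0f}},    // device 0 (reference)
        {{1.01f, 1.99f}},  // device 1 has drifted slightly
    };

    for (std::size_t iteration = 1; iteration <= 4000; ++iteration)
    {
        // ... per-replica training step would go here ...
        if (replicas.size() > 1 && iteration % 2000 == 1)
            sync_from_reference(replicas[0], replicas);
    }
    std::cout << replicas[1][0][0] << "\n";  // 1 (back in sync with the reference)
}
```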