Commit 168574bd authored by Davis King

Added visit_layer_parameter_gradients() and also fixed a silly synchronization
error in the multi-gpu training code.
parent d31723ff
@@ -3049,6 +3049,73 @@ namespace dlib
        impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v);
    }
// ----------------------------------------------------------------------------------------
    namespace impl
    {
        template <size_t i, size_t num>
        struct vlpg_loop
        {
            template <typename T, typename U>
            static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
            {
                // intentionally left empty
            }

            template <typename T, typename U>
            static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
            {
                v(comp_i, l.get_parameter_gradient());
                ++comp_i;
            }

            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t comp_i,
                net_type& net,
                visitor&& v
            )
            {
                invoke_functor(v, comp_i, layer<i>(net));
                vlpg_loop<i+1, num>::visit(comp_i, net, v);
            }
        };

        template <size_t num>
        struct vlpg_loop<num,num>
        {
            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t,
                net_type&,
                visitor&&
            )
            {
                // Base case of recursion.  Don't do anything.
            }
        };
    }

    template <
        typename net_type,
        typename visitor
        >
    void visit_layer_parameter_gradients(
        net_type& net,
        visitor v
    )
    {
        size_t comp_i = 0;
        impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v);
    }
// ----------------------------------------------------------------------------------------

}
...
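
The new visit_layer_parameter_gradients() mirrors the existing visit_layer_parameters(), but hands the visitor each computational layer's parameter gradient tensor instead of its parameters. A minimal usage sketch follows; the network type is purely illustrative and not part of this commit, and gradients are only non-empty after a trainer (or a manual forward/backward pass) has filled them in:

    #include <dlib/dnn.h>
    #include <iostream>

    using namespace dlib;

    // An arbitrary small network, purely for illustration.
    using net_type = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

    int main()
    {
        net_type net;

        // Sum the squared gradient norms over all computational layers.  Empty
        // tensors (layers whose gradients haven't been computed yet) are skipped,
        // just like the trainer code in this commit does.
        double grad_norm_sq = 0;
        visit_layer_parameter_gradients(net, [&grad_norm_sq](size_t idx, tensor& grad)
        {
            if (grad.size() != 0)
                grad_norm_sq += sum(pointwise_multiply(mat(grad), mat(grad)));
            std::cout << "layer " << idx << " gradient elements: " << grad.size() << std::endl;
        });
        std::cout << "sum of squared gradient norms: " << grad_norm_sq << std::endl;
    }
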
@@ -1348,6 +1348,40 @@ namespace dlib
            - When v() is called, the first argument is always < net_type::num_computational_layers.
    !*/
// ----------------------------------------------------------------------------------------
    template <
        typename net_type,
        typename visitor
        >
    void visit_layer_parameter_gradients(
        net_type& net,
        visitor v
    );
    /*!
        requires
            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
              add_tag_layer.
            - v is a function object with a signature equivalent to:
                v(size_t idx, tensor& t)
        ensures
            - Loops over all the computational layers (i.e. layers with parameters, as
              opposed to loss, tag, or input layers) in net and passes their parameter
              gradients to v().  To be specific, this function essentially performs the
              following:

                size_t computational_layer_idx = 0;
                for (size_t i = 0; i < net_type::num_layers; ++i)
                {
                    if (layer<i>(net) is a computational layer)
                    {
                        v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
                        ++computational_layer_idx;
                    }
                }
            - When v() is called, the first argument is always < net_type::num_computational_layers.
    !*/
// ----------------------------------------------------------------------------------------

    struct layer_test_results
...
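
The guarantee above that the visitor's index always stays below net_type::num_computational_layers is what makes it safe to index directly into a pre-sized per-layer buffer, which is exactly the pattern the trainer change below relies on. A small sketch of that pattern; the helper name accumulate_gradients is illustrative, not a dlib API:

    #include <dlib/dnn.h>
    #include <vector>

    using namespace dlib;

    // Add every layer's parameter gradient into a per-layer host-side buffer.
    // buffer[idx] is always a valid element because idx < num_computational_layers.
    template <typename net_type>
    void accumulate_gradients(net_type& net, std::vector<matrix<float>>& buffer)
    {
        buffer.resize(net_type::num_computational_layers);
        visit_layer_parameter_gradients(net, [&buffer](size_t idx, tensor& grad)
        {
            if (grad.size() != 0)
            {
                if (buffer[idx].size() == 0)
                    buffer[idx] = mat(grad);   // first contribution defines the size
                else
                    buffer[idx] += mat(grad);  // later contributions are summed
            }
        });
    }
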
@@ -501,9 +501,12 @@ namespace dlib
std::vector<std::future<double>> losses(devices.size());
std::vector<std::future<void>> update_futs(devices.size());
std::vector<matrix<float>> param_buffer(net_type::num_computational_layers);
std::vector<matrix<float>> param_grad_buffer(net_type::num_computational_layers);

size_t iteration = 0;
while(job_pipe.dequeue(next_job))
{
    ++iteration;
    // Call compute_parameter_gradients() and update_parameters() but pick the
    // right version for unsupervised or supervised training based on the type
    // of label_type.
@@ -517,17 +520,45 @@
    // gradient updates between devices.  So we do that now.
    if (devices.size() > 1)
    {
        for (auto&& p : param_grad_buffer)
            p = 0;
        // now average all the parameter gradients
        for (size_t i = 0; i < devices.size(); ++i)
        {
            visit_layer_parameter_gradients(devices[i]->net, [&param_grad_buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                    param_grad_buffer[j] += mat(t);
            });
        }
        // and then assign the parameter gradients back to all the networks
        const float scale = 1.0f/devices.size();
        for (size_t i = 0; i < devices.size(); ++i)
        {
            visit_layer_parameter_gradients(devices[i]->net, [scale,&param_grad_buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                {
                    t = param_grad_buffer[j]*scale;
                    t.async_copy_to_device();
                }
            });
        }
        // Every now and then force all the parameters to be the same just to
        // make sure they aren't drifting apart due to any non-deterministic
        // behavior on the GPU.
        if (iteration%5000 == 1)
        {
            for (auto&& p : param_buffer)
                p = 0;
            // now average all the parameters
            for (size_t i = 0; i < devices.size(); ++i)
            {
                visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
                    if (t.size() != 0)
                        param_buffer[j] += mat(t);
                });
            }
            // and then assign the parameters back to all the networks.
            const float scale = 1.0f/devices.size();
            for (size_t i = 0; i < devices.size(); ++i)
            {
@@ -540,6 +571,7 @@ namespace dlib
                });
            }
        }
    }

    // Now apply all the updates to each device.
...
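
The trainer fix above does two things. Each iteration the parameter gradients are now averaged across devices via the new visit_layer_parameter_gradients() (the old code summed the parameters into param_buffer here, despite the comment saying it was averaging gradients, which is the synchronization error mentioned in the commit message). In addition, every 5000 iterations the parameters themselves are averaged as a safety net, so that non-deterministic GPU arithmetic can't let the replicas drift apart; because iteration is incremented before the check, iteration%5000 == 1 also triggers this sync on the very first pass through the loop. The gradient-averaging pattern, distilled from the diff into a standalone helper (the function name and the raw-pointer container are illustrative, not dlib APIs):

    #include <dlib/dnn.h>
    #include <vector>

    using namespace dlib;

    // Illustrative helper (not a dlib API): average the parameter gradients of
    // several replicas of the same network and write the average back into each
    // replica, so every device applies an identical update step.
    template <typename net_type>
    void average_parameter_gradients(std::vector<net_type*>& replicas)
    {
        std::vector<matrix<float>> buffer(net_type::num_computational_layers);
        for (auto&& b : buffer)
            b = 0;

        // Sum each layer's gradient over all replicas.
        for (auto* r : replicas)
        {
            visit_layer_parameter_gradients(*r, [&buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                    buffer[j] += mat(t);
            });
        }

        // Write the averaged gradient back into every replica.
        const float scale = 1.0f/replicas.size();
        for (auto* r : replicas)
        {
            visit_layer_parameter_gradients(*r, [scale,&buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                {
                    t = buffer[j]*scale;
                    t.async_copy_to_device();
                }
            });
        }
    }

Averaging the gradients keeps the replicas taking the same step every iteration, while the periodic parameter average is only a backstop against slow numerical drift.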