Commit c49631dd authored by Davis King

Changed tensor layout from NHWC to NCHW since this layout is much faster for cuDNN.
parent 30c4ee54
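
For reference (not part of the commit), here is a minimal sketch of how a flat float buffer is indexed under the two layouts, using dlib's dimension names n (samples), k (channels), nr (rows), nc (columns). The helper names are illustrative only; the point is that in NCHW each channel becomes one contiguous nr*nc plane, which is the layout cuDNN runs fastest on.

    #include <cstddef>

    // Old layout (NHWC): channels are interleaved, k varies fastest.
    inline std::size_t index_nhwc(long K, long NR, long NC,
                                  long n, long r, long c, long k)
    {
        return ((n*NR + r)*NC + c)*K + k;
    }

    // New layout (NCHW): each channel is a contiguous NR*NC plane.
    inline std::size_t index_nchw(long K, long NR, long NC,
                                  long n, long k, long r, long c)
    {
        return ((n*K + k)*NR + r)*NC + c;
    }
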
@@ -1362,7 +1362,7 @@ namespace dlib
     namespace timpl
     {

-        void fill_with_gassuan_random_numbers (
+        inline void fill_with_gassuan_random_numbers (
             tensor& t,
             dlib::rand& rnd,
             double sigma = 1
@@ -1383,12 +1383,12 @@ namespace dlib
             // Output and gradient_input have to have the same dimensions in each
             // layer.
             const long num_samples = rnd.get_random_32bit_number()%4+3;
+            const long k = rnd.get_random_32bit_number()%4+2;
             const long nr = rnd.get_random_32bit_number()%4+2;
             const long nc = rnd.get_random_32bit_number()%4+2;
-            const long k = rnd.get_random_32bit_number()%4+2;
-            output.set_size(num_samples, nr, nc, k);
-            gradient_input.set_size(num_samples, nr, nc, k);
+            output.set_size(num_samples, k, nr, nc);
+            gradient_input.set_size(num_samples, k, nr, nc);

             // Use a non-zero initial gradient to make sure the layers add to it
             // rather than assign and blow away the initial value.
@@ -1447,7 +1447,8 @@ namespace dlib
         };

-        void print_tensor(
+        // TODO, remove?
+        inline void print_tensor(
             const tensor& a
         )
         {
...
@@ -85,9 +85,9 @@ namespace dlib
         void tensor_descriptor::
         set_size(
             int n,
+            int k,
             int nr,
-            int nc,
-            int k
+            int nc
         )
         {
             if (n == 0 || nr == 0 || nc == 0 || k == 0)
@@ -105,7 +105,7 @@ namespace dlib
             handle = h;
             check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
-                    CUDNN_TENSOR_NHWC,
+                    CUDNN_TENSOR_NCHW,
                     CUDNN_DATA_FLOAT,
                     n,
                     k,
@@ -117,9 +117,9 @@ namespace dlib
         void tensor_descriptor::
         get_size (
             int& n,
-            int& nr,
-            int& nc,
-            int& k
+            int& k,
+            int& nr,
+            int& nc
         ) const
         {
             if (handle)
@@ -140,9 +140,9 @@ namespace dlib
             else
             {
                 n = 0;
+                k = 0;
                 nr = 0;
                 nc = 0;
-                k = 0;
             }
         }
@@ -254,7 +254,7 @@ namespace dlib
                 &out_nc));
             tensor_descriptor dest_desc;
-            dest_desc.set_size(out_num_samples,out_nr,out_nc,out_k);
+            dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc);

             cudnnConvolutionFwdAlgo_t forward_best_algo;
             check(cudnnGetConvolutionForwardAlgorithm(
@@ -299,7 +299,7 @@ namespace dlib
             const tensor& filters
         )
         {
-            output.set_size(out_num_samples, out_nr, out_nc, out_k);
+            output.set_size(out_num_samples, out_k, out_nr, out_nc);
             // TODO, remove
...
@@ -37,9 +37,9 @@ namespace dlib
         void set_size(
             int n,
+            int k,
             int nr,
-            int nc,
-            int k
+            int nc
         );
         /*!
             ensures
@@ -48,9 +48,9 @@ namespace dlib
         void get_size (
             int& n,
-            int& nr,
-            int& nc,
-            int& k
+            int& k,
+            int& nr,
+            int& nc
         ) const;

         const void* get_handle (
@@ -209,9 +209,9 @@ namespace dlib
         // dimensions of the output tensor from operator()
         int out_num_samples;
+        int out_k;
         int out_nr;
         int out_nc;
-        int out_k;
         int forward_algo;
         size_t forward_workspace_size_in_bytes;
...
@@ -56,8 +56,9 @@ namespace dlib
             // initialize data to the right size to contain the stuff in the iterator range.
-            data.set_size(std::distance(ibegin,iend), nr, nc, pixel_traits<T>::num);
+            data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc);
+            const size_t offset = nr*nc;

             auto ptr = data.host();
             for (auto i = ibegin; i != iend; ++i)
             {
@@ -66,10 +67,15 @@ namespace dlib
                     for (long c = 0; c < nc; ++c)
                     {
                         auto temp = pixel_to_vector<float>((*i)(r,c));
+                        auto p = ptr++;
                         for (long j = 0; j < temp.size(); ++j)
-                            *ptr++ = temp(j);
+                        {
+                            *p = temp(j);
+                            p += offset;
+                        }
                     }
                 }
+                ptr += offset*(data.k()-1);
             }
         }
@@ -123,8 +129,9 @@ namespace dlib
             // initialize data to the right size to contain the stuff in the iterator range.
-            data.set_size(std::distance(ibegin,iend), nr, nc, pixel_traits<T>::num);
+            data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc);
+            const size_t offset = nr*nc;

             auto ptr = data.host();
             for (auto i = ibegin; i != iend; ++i)
             {
@@ -133,10 +140,15 @@ namespace dlib
                     for (long c = 0; c < nc; ++c)
                     {
                         auto temp = pixel_to_vector<float>((*i)[r][c]);
+                        auto p = ptr++;
                         for (long j = 0; j < temp.size(); ++j)
-                            *ptr++ = temp(j);
+                        {
+                            *p = temp(j);
+                            p += offset;
+                        }
                     }
                 }
+                ptr += offset*(data.k()-1);
             }
         }
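
The two input-conversion loops above now scatter each pixel's channels into separate per-channel planes instead of interleaving them. A standalone sketch of that write pattern (illustrative only, not dlib code; the buffer name and channel vector are assumptions):

    #include <cstddef>
    #include <vector>

    // Write the channels of the pixel at (r,c) of sample 0 into a planar (NCHW)
    // buffer.  'data' must hold at least channels.size()*nr*nc floats; plane j
    // holds channel j, and consecutive planes are nr*nc floats apart.
    void write_pixel_planar(std::vector<float>& data, long nr, long nc,
                            long r, long c, const std::vector<float>& channels)
    {
        const std::size_t offset = nr*nc;      // stride between channel planes
        float* p = data.data() + r*nc + c;     // (r,c) within the first plane
        for (std::size_t j = 0; j < channels.size(); ++j)
        {
            *p = channels[j];                  // channel j of this pixel
            p += offset;                       // same (r,c) in the next plane
        }
    }
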
...
@@ -83,7 +83,7 @@ namespace dlib
         template <typename SUBNET>
         void forward(const SUBNET& sub, resizable_tensor& output)
         {
-            output.set_size(sub.get_output().num_samples(), 1,1,num_outputs);
+            output.set_size(sub.get_output().num_samples(), num_outputs);
             output = mat(sub.get_output())*mat(params);
         }
...
@@ -22,30 +22,22 @@ namespace dlib
         tensor (
         ) :
-            m_n(0), m_nr(0), m_nc(0), m_k(0)
+            m_n(0), m_k(0), m_nr(0), m_nc(0)
         {
         }

         inline virtual ~tensor() = 0;

         long num_samples() const { return m_n; }
+        long k() const { return m_k; }
         long nr() const { return m_nr; }
         long nc() const { return m_nc; }
-        long k() const { return m_k; }
         size_t size() const { return data.size(); }

         void async_copy_to_device()
         {
             data.async_copy_to_device();
         }
-        /*!
-            ensures
-                - begin asynchronously copying this tensor to the GPU.
-                  NOTE that the "get device pointer" routine in this class
-                  will have to do some kind of synchronization that ensures
-                  the copy is finished.
-        !*/

         const float* host() const { return data.host(); }
         float* host() { return data.host(); }
@@ -135,13 +127,13 @@ namespace dlib
         tensor& operator= (const tensor& item)
         {
             m_n = item.m_n;
+            m_k = item.m_k;
             m_nr = item.m_nr;
             m_nc = item.m_nc;
-            m_k = item.m_k;
             data.set_size(item.data.size());
             std::memcpy(data.host(), item.data.host(), data.size()*sizeof(float));
 #ifdef DLIB_USE_CUDA
-            cudnn_descriptor.set_size(m_n,m_nr,m_nc,m_k);
+            cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
 #endif
             return *this;
         }
@@ -159,9 +151,9 @@ namespace dlib
         void swap(tensor& item)
         {
             std::swap(m_n, item.m_n);
+            std::swap(m_k, item.m_k);
             std::swap(m_nr, item.m_nr);
             std::swap(m_nc, item.m_nc);
-            std::swap(m_k, item.m_k);
             std::swap(data, item.data);
 #ifdef DLIB_USE_CUDA
             std::swap(cudnn_descriptor, item.cudnn_descriptor);
@@ -170,9 +162,9 @@ namespace dlib
         long m_n;
+        long m_k;
         long m_nr;
         long m_nc;
-        long m_k;

         gpu_data data;

 #ifdef DLIB_USE_CUDA
         cuda::tensor_descriptor cudnn_descriptor;
@@ -227,9 +219,9 @@ namespace dlib
     )
     {
         return a.num_samples() == b.num_samples() &&
+               a.k() == b.k() &&
                a.nr() == b.nr() &&
-               a.nc() == b.nc() &&
-               a.k() == b.k();
+               a.nc() == b.nc();
     }

 // ----------------------------------------------------------------------------------------
@@ -242,10 +234,10 @@ namespace dlib
         {}

         explicit resizable_tensor(
-            long n_, long nr_ = 1, long nc_ = 1, long k_ = 1
+            long n_, long k_ = 1, long nr_ = 1, long nc_ = 1
         )
         {
-            set_size(n_,nr_,nc_,k_);
+            set_size(n_,k_,nr_,nc_);
         }

         resizable_tensor(const resizable_tensor&) = default;
@@ -265,7 +257,7 @@ namespace dlib
                 - resizes *this so that: have_same_dimensions(#*this, item)==true
         !*/
         {
-            set_size(item.num_samples(), item.nr(), item.nc(), item.k());
+            set_size(item.num_samples(), item.k(), item.nr(), item.nc());
         }

         resizable_tensor& operator= (float val)
@@ -323,16 +315,16 @@ namespace dlib
         }

         void set_size(
-            long n_, long nr_ = 1, long nc_ = 1, long k_ = 1
+            long n_, long k_ = 1, long nr_ = 1, long nc_ = 1
         )
         {
             m_n = n_;
+            m_k = k_;
             m_nr = nr_;
             m_nc = nc_;
-            m_k = k_;
-            data.set_size(m_n*m_nr*m_nc*m_k);
+            data.set_size(m_n*m_k*m_nr*m_nc);
 #ifdef DLIB_USE_CUDA
-            cudnn_descriptor.set_size(m_n,m_nr,m_nc,m_k);
+            cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
 #endif
         }
     };
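
With the new argument order, the trailing dimensions of set_size and of the resizable_tensor constructor default to 1, so a tensor of row vectors (as in the fully connected layer earlier in this diff) can be sized with just two arguments. A small usage sketch based on the class as it appears in this commit (values are illustrative):

    dlib::resizable_tensor t(2, 3, 4, 5);   // 2 samples, 3 channels, 4 rows, 5 columns
    // t.num_samples()==2, t.k()==3, t.nr()==4, t.nc()==5, t.size()==2*3*4*5

    dlib::resizable_tensor out;
    out.set_size(10, 16);                   // 10 samples, k=16, nr and nc default to 1
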
@@ -342,9 +334,9 @@ namespace dlib
         int version = 1;
         serialize(version, out);
         serialize(item.num_samples(), out);
+        serialize(item.k(), out);
         serialize(item.nr(), out);
         serialize(item.nc(), out);
-        serialize(item.k(), out);
         auto data = item.host();
         for (size_t i = 0; i < item.size(); ++i)
             serialize(data[i], out);
@@ -357,12 +349,12 @@ namespace dlib
         if (version != 1)
             throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");

-        long num_samples=0, nr=0, nc=0, k=0;
+        long num_samples=0, k=0, nr=0, nc=0;
         deserialize(num_samples, in);
+        deserialize(k, in);
         deserialize(nr, in);
         deserialize(nc, in);
-        deserialize(k, in);
-        item.set_size(num_samples, nr, nc, k);
+        item.set_size(num_samples, k, nr, nc);
         auto data = item.host();
         for (size_t i = 0; i < item.size(); ++i)
             deserialize(data[i], in);
...