Commit c49631dd authored by Davis King

Changed tensor layout from NHWC to NCHW since this layout is much faster for cuDNN.
parent 30c4ee54
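
For reference (not part of the commit), here is a minimal sketch of how a flat float buffer is indexed under the two layouts, using dlib's dimension names n (samples), k (channels), nr (rows), nc (columns). The helper names are illustrative only; the point is that in NCHW each channel becomes one contiguous nr*nc plane, which is the layout cuDNN runs fastest on.

    #include <cstddef>

    // Old layout (NHWC): channels are interleaved, k varies fastest.
    inline std::size_t index_nhwc(long K, long NR, long NC,
                                  long n, long r, long c, long k)
    {
        return ((n*NR + r)*NC + c)*K + k;
    }

    // New layout (NCHW): each channel is a contiguous NR*NC plane.
    inline std::size_t index_nchw(long K, long NR, long NC,
                                  long n, long k, long r, long c)
    {
        return ((n*K + k)*NR + r)*NC + c;
    }
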
@@ -1362,7 +1362,7 @@ namespace dlib
     namespace timpl
     {

-        void fill_with_gassuan_random_numbers (
+        inline void fill_with_gassuan_random_numbers (
             tensor& t,
             dlib::rand& rnd,
             double sigma = 1
@@ -1383,12 +1383,12 @@ namespace dlib
             // Output and gradient_input have to have the same dimensions in each
             // layer.
             const long num_samples = rnd.get_random_32bit_number()%4+3;
+            const long k = rnd.get_random_32bit_number()%4+2;
             const long nr = rnd.get_random_32bit_number()%4+2;
             const long nc = rnd.get_random_32bit_number()%4+2;
-            const long k = rnd.get_random_32bit_number()%4+2;
-            output.set_size(num_samples, nr, nc, k);
-            gradient_input.set_size(num_samples, nr, nc, k);
+            output.set_size(num_samples, k, nr, nc);
+            gradient_input.set_size(num_samples, k, nr, nc);

             // Use a non-zero initial gradient to make sure the layers add to it
             // rather than assign and blow away the initial value.
@@ -1447,7 +1447,8 @@ namespace dlib
         };

-        void print_tensor(
+        // TODO, remove?
+        inline void print_tensor(
             const tensor& a
         )
         {
...
@@ -85,9 +85,9 @@ namespace dlib
         void tensor_descriptor::
         set_size(
             int n,
+            int k,
             int nr,
-            int nc,
-            int k
+            int nc
         )
         {
             if (n == 0 || nr == 0 || nc == 0 || k == 0)
@@ -105,7 +105,7 @@ namespace dlib
             handle = h;
             check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
-                    CUDNN_TENSOR_NHWC,
+                    CUDNN_TENSOR_NCHW,
                     CUDNN_DATA_FLOAT,
                     n,
                     k,
@@ -117,9 +117,9 @@ namespace dlib
         void tensor_descriptor::
         get_size (
             int& n,
-            int& nr,
-            int& nc,
-            int& k
+            int& k,
+            int& nr,
+            int& nc
         ) const
         {
             if (handle)
@@ -140,9 +140,9 @@ namespace dlib
             else
             {
                 n = 0;
+                k = 0;
                 nr = 0;
                 nc = 0;
-                k = 0;
             }
         }
@@ -254,7 +254,7 @@ namespace dlib
                 &out_nc));
             tensor_descriptor dest_desc;
-            dest_desc.set_size(out_num_samples,out_nr,out_nc,out_k);
+            dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc);

             cudnnConvolutionFwdAlgo_t forward_best_algo;
             check(cudnnGetConvolutionForwardAlgorithm(
@@ -299,7 +299,7 @@ namespace dlib
             const tensor& filters
         )
         {
-            output.set_size(out_num_samples, out_nr, out_nc, out_k);
+            output.set_size(out_num_samples, out_k, out_nr, out_nc);
             // TODO, remove
...
@@ -37,9 +37,9 @@ namespace dlib
         void set_size(
             int n,
+            int k,
             int nr,
-            int nc,
-            int k
+            int nc
         );
         /*!
             ensures
@@ -48,9 +48,9 @@ namespace dlib
         void get_size (
             int& n,
-            int& nr,
-            int& nc,
-            int& k
+            int& k,
+            int& nr,
+            int& nc
         ) const;

         const void* get_handle (
@@ -209,9 +209,9 @@ namespace dlib
         // dimensions of the output tensor from operator()
         int out_num_samples;
+        int out_k;
         int out_nr;
         int out_nc;
-        int out_k;
         int forward_algo;
         size_t forward_workspace_size_in_bytes;
...
@@ -56,8 +56,9 @@ namespace dlib
             // initialize data to the right size to contain the stuff in the iterator range.
-            data.set_size(std::distance(ibegin,iend), nr, nc, pixel_traits<T>::num);
+            data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc);
+            const size_t offset = nr*nc;

             auto ptr = data.host();
             for (auto i = ibegin; i != iend; ++i)
             {
@@ -66,10 +67,15 @@ namespace dlib
                     for (long c = 0; c < nc; ++c)
                     {
                         auto temp = pixel_to_vector<float>((*i)(r,c));
+                        auto p = ptr++;
                         for (long j = 0; j < temp.size(); ++j)
-                            *ptr++ = temp(j);
+                        {
+                            *p = temp(j);
+                            p += offset;
+                        }
                     }
                 }
+                ptr += offset*(data.k()-1);
             }
         }
@@ -123,8 +129,9 @@ namespace dlib
             // initialize data to the right size to contain the stuff in the iterator range.
-            data.set_size(std::distance(ibegin,iend), nr, nc, pixel_traits<T>::num);
+            data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc);
+            const size_t offset = nr*nc;

             auto ptr = data.host();
             for (auto i = ibegin; i != iend; ++i)
             {
@@ -133,10 +140,15 @@ namespace dlib
                     for (long c = 0; c < nc; ++c)
                     {
                         auto temp = pixel_to_vector<float>((*i)[r][c]);
+                        auto p = ptr++;
                         for (long j = 0; j < temp.size(); ++j)
-                            *ptr++ = temp(j);
+                        {
+                            *p = temp(j);
+                            p += offset;
+                        }
                     }
                 }
+                ptr += offset*(data.k()-1);
             }
         }
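
The two input-conversion loops above now scatter each pixel's channels into separate per-channel planes instead of interleaving them. A standalone sketch of that write pattern (illustrative only, not dlib code; the buffer name and channel vector are assumptions):

    #include <cstddef>
    #include <vector>

    // Write the channels of the pixel at (r,c) of sample 0 into a planar (NCHW)
    // buffer.  'data' must hold at least channels.size()*nr*nc floats; plane j
    // holds channel j, and consecutive planes are nr*nc floats apart.
    void write_pixel_planar(std::vector<float>& data, long nr, long nc,
                            long r, long c, const std::vector<float>& channels)
    {
        const std::size_t offset = nr*nc;      // stride between channel planes
        float* p = data.data() + r*nc + c;     // (r,c) within the first plane
        for (std::size_t j = 0; j < channels.size(); ++j)
        {
            *p = channels[j];                  // channel j of this pixel
            p += offset;                       // same (r,c) in the next plane
        }
    }
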
...
@@ -83,7 +83,7 @@ namespace dlib
         template <typename SUBNET>
         void forward(const SUBNET& sub, resizable_tensor& output)
         {
-            output.set_size(sub.get_output().num_samples(), 1,1,num_outputs);
+            output.set_size(sub.get_output().num_samples(), num_outputs);
             output = mat(sub.get_output())*mat(params);
         }
...
@@ -22,30 +22,22 @@ namespace dlib
         tensor (
         ) :
-            m_n(0), m_nr(0), m_nc(0), m_k(0)
+            m_n(0), m_k(0), m_nr(0), m_nc(0)
         {
         }

         inline virtual ~tensor() = 0;

         long num_samples() const { return m_n; }
+        long k() const { return m_k; }
         long nr() const { return m_nr; }
         long nc() const { return m_nc; }
-        long k() const { return m_k; }
         size_t size() const { return data.size(); }

         void async_copy_to_device()
         {
             data.async_copy_to_device();
         }
-        /*!
-            ensures
-                - begin asynchronously copying this tensor to the GPU.
-                  NOTE that the "get device pointer" routine in this class
-                  will have to do some kind of synchronization that ensures
-                  the copy is finished.
-        !*/

         const float* host() const { return data.host(); }
         float* host() { return data.host(); }
@@ -135,13 +127,13 @@ namespace dlib
         tensor& operator= (const tensor& item)
         {
             m_n = item.m_n;
+            m_k = item.m_k;
             m_nr = item.m_nr;
             m_nc = item.m_nc;
-            m_k = item.m_k;
             data.set_size(item.data.size());
             std::memcpy(data.host(), item.data.host(), data.size()*sizeof(float));
 #ifdef DLIB_USE_CUDA
-            cudnn_descriptor.set_size(m_n,m_nr,m_nc,m_k);
+            cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
 #endif
             return *this;
         }
@@ -159,9 +151,9 @@ namespace dlib
         void swap(tensor& item)
         {
             std::swap(m_n, item.m_n);
+            std::swap(m_k, item.m_k);
             std::swap(m_nr, item.m_nr);
             std::swap(m_nc, item.m_nc);
-            std::swap(m_k, item.m_k);
             std::swap(data, item.data);
 #ifdef DLIB_USE_CUDA
             std::swap(cudnn_descriptor, item.cudnn_descriptor);
@@ -170,9 +162,9 @@ namespace dlib
         long m_n;
+        long m_k;
         long m_nr;
         long m_nc;
-        long m_k;

         gpu_data data;

 #ifdef DLIB_USE_CUDA
         cuda::tensor_descriptor cudnn_descriptor;
@@ -227,9 +219,9 @@ namespace dlib
     )
     {
         return a.num_samples() == b.num_samples() &&
+               a.k() == b.k() &&
                a.nr() == b.nr() &&
-               a.nc() == b.nc() &&
-               a.k() == b.k();
+               a.nc() == b.nc();
     }

 // ----------------------------------------------------------------------------------------
@@ -242,10 +234,10 @@ namespace dlib
         {}

         explicit resizable_tensor(
-            long n_, long nr_ = 1, long nc_ = 1, long k_ = 1
+            long n_, long k_ = 1, long nr_ = 1, long nc_ = 1
         )
         {
-            set_size(n_,nr_,nc_,k_);
+            set_size(n_,k_,nr_,nc_);
         }

         resizable_tensor(const resizable_tensor&) = default;
@@ -265,7 +257,7 @@ namespace dlib
                 - resizes *this so that: have_same_dimensions(#*this, item)==true
         !*/
         {
-            set_size(item.num_samples(), item.nr(), item.nc(), item.k());
+            set_size(item.num_samples(), item.k(), item.nr(), item.nc());
         }

         resizable_tensor& operator= (float val)
@@ -323,16 +315,16 @@ namespace dlib
         }

         void set_size(
-            long n_, long nr_ = 1, long nc_ = 1, long k_ = 1
+            long n_, long k_ = 1, long nr_ = 1, long nc_ = 1
         )
         {
             m_n = n_;
+            m_k = k_;
             m_nr = nr_;
             m_nc = nc_;
-            m_k = k_;
-            data.set_size(m_n*m_nr*m_nc*m_k);
+            data.set_size(m_n*m_k*m_nr*m_nc);
 #ifdef DLIB_USE_CUDA
-            cudnn_descriptor.set_size(m_n,m_nr,m_nc,m_k);
+            cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
 #endif
         }
     };
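
With the new argument order, the trailing dimensions of set_size and of the resizable_tensor constructor default to 1, so a tensor of row vectors (as in the fully connected layer earlier in this diff) can be sized with just two arguments. A small usage sketch based on the class as it appears in this commit (values are illustrative):

    dlib::resizable_tensor t(2, 3, 4, 5);   // 2 samples, 3 channels, 4 rows, 5 columns
    // t.num_samples()==2, t.k()==3, t.nr()==4, t.nc()==5, t.size()==2*3*4*5

    dlib::resizable_tensor out;
    out.set_size(10, 16);                   // 10 samples, k=16, nr and nc default to 1
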
@@ -342,9 +334,9 @@ namespace dlib
         int version = 1;
         serialize(version, out);
         serialize(item.num_samples(), out);
+        serialize(item.k(), out);
         serialize(item.nr(), out);
         serialize(item.nc(), out);
-        serialize(item.k(), out);
         auto data = item.host();
         for (size_t i = 0; i < item.size(); ++i)
             serialize(data[i], out);
@@ -357,12 +349,12 @@ namespace dlib
         if (version != 1)
             throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");

-        long num_samples=0, nr=0, nc=0, k=0;
+        long num_samples=0, k=0, nr=0, nc=0;
         deserialize(num_samples, in);
+        deserialize(k, in);
         deserialize(nr, in);
         deserialize(nc, in);
-        deserialize(k, in);
-        item.set_size(num_samples, nr, nc, k);
+        item.set_size(num_samples, k, nr, nc);
         auto data = item.host();
         for (size_t i = 0; i < item.size(); ++i)
             deserialize(data[i], in);
...