Commit ecc00ce7 authored by Davis King

merged

parents 573fce15 99ce564b
dlib/dnn/cudnn_dlibapi.cpp
@@ -240,7 +240,7 @@ namespace dlib
             return;
         CHECK_CUDNN(cudnnSetTensor(context(),
                                    descriptor(t),
-                                   t.device(),
+                                   t.device_write_only(),
                                    &value));
     }
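The change above is safe because cudnnSetTensor() assigns the given value to every element of the tensor, so whatever the device buffer held beforehand is irrelevant and the host-to-device synchronization that device() would perform can be skipped. A minimal sketch of the same pattern using gpu_data directly (assumes DLIB_USE_CUDA; zero_on_device is an illustrative helper, not dlib API):

    #include <dlib/dnn.h>
    #include <cuda_runtime.h>

    // Overwrite an entire gpu_data buffer on the device without syncing it first.
    void zero_on_device(dlib::gpu_data& buf)
    {
        // device() would pay for a host-to-device copy whenever the host side
        // is newer. device_write_only() skips that copy because we promise to
        // assign every element, which cudaMemset below does.
        float* d = buf.device_write_only();
        cudaMemset(d, 0, buf.size()*sizeof(float));
    }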
dlib/dnn/gpu_data.h
@@ -95,6 +95,13 @@ namespace dlib
             return data_host.get();
         }

+        float* host_write_only()
+        {
+            host_current = true;
+            device_current = false;
+            return data_host.get();
+        }
+
         const float* device() const
         {
 #ifndef DLIB_USE_CUDA
@@ -116,6 +123,18 @@ namespace dlib
             return data_device.get();
         }

+        float* device_write_only()
+        {
+#ifndef DLIB_USE_CUDA
+            DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+            wait_for_transfer_to_finish();
+            host_current = false;
+            device_current = true;
+            device_in_use = true;
+            return data_device.get();
+        }
+
         bool host_ready (
         ) const { return host_current; }
dlib/dnn/gpu_data_abstract.h
@@ -125,6 +125,20 @@ namespace dlib
               calling host().
         !*/

+        float* host_write_only(
+        );
+        /*!
+            ensures
+                - This function returns the same pointer as host(), except that it never
+                  performs a device to host memory copy. Instead, it immediately marks the
+                  device side data as out of date, effectively discarding it. Therefore,
+                  the values in the data pointed to by host_write_only() are undefined and
+                  you should only call host_write_only() if you are going to assign to
+                  every memory location in the returned memory block.
+                - #host_ready() == true
+                - #device_ready() == false
+        !*/
+
         const float* device(
         ) const;
         /*!
@@ -154,6 +168,23 @@ namespace dlib
                 - #device_ready() == true
         !*/

+        float* device_write_only(
+        );
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - This function returns the same pointer as device(), except that it never
+                  performs a host to device memory copy. Instead, it immediately marks the
+                  host side data as out of date, effectively discarding it. Therefore, the
+                  values in the data pointed to by device_write_only() are undefined and
+                  you should only call device_write_only() if you are going to assign to
+                  every memory location in the returned memory block.
+                - #host_ready() == false
+                - #device_ready() == true
+        !*/
+
         size_t size(
         ) const;
         /*!
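A minimal host-side sketch exercising the contract documented above: host_write_only() is pure bookkeeping, so the caller must treat the returned block as uninitialized and assign all of it.

    #include <dlib/dnn.h>
    #include <cassert>

    int main()
    {
        dlib::gpu_data buf;
        buf.set_size(100);

        // The returned block has undefined contents, so assign every element.
        float* p = buf.host_write_only();
        for (size_t i = 0; i < buf.size(); ++i)
            p[i] = 1;

        // The postconditions stated in the spec:
        assert(buf.host_ready());
        assert(!buf.device_ready());
        return 0;
    }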
dlib/dnn/loss.h
@@ -68,7 +68,7 @@ namespace dlib
             const double scale = 1.0/output_tensor.num_samples();
             double loss = 0;
             const float* out_data = output_tensor.host();
-            float* g = grad.host();
+            float* g = grad.host_write_only();
             for (long i = 0; i < output_tensor.num_samples(); ++i)
             {
                 const float y = *truth++;
@@ -79,6 +79,10 @@ namespace dlib
                     loss += scale*temp;
                     g[i] = -scale*y;
                 }
+                else
+                {
+                    g[i] = 0;
+                }
             }
             return loss;
         }
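The new else branch is forced by the switch from grad.host() to grad.host_write_only(): the gradient buffer's contents are now undefined, so the loop has to assign every element instead of leaving off-margin samples untouched. A standalone sketch of the resulting hinge-gradient pattern (the function and parameter names are illustrative, not dlib API):

    // Hinge loss gradient over n samples; g points at write-only memory, so
    // the loop must write every element. Returns the mean loss.
    double hinge_loss_and_gradient(const float* out, const float* truth,
                                   float* g, long n)
    {
        const double scale = 1.0/n;
        double loss = 0;
        for (long i = 0; i < n; ++i)
        {
            const float y = truth[i];
            const float temp = 1 - y*out[i];
            if (temp > 0)
            {
                loss += scale*temp;
                g[i] = -scale*y;   // inside the margin: non-zero gradient
            }
            else
            {
                g[i] = 0;          // must be written explicitly: g is uninitialized
            }
        }
        return loss;
    }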
dlib/dnn/tensor.h
@@ -8,6 +8,7 @@
 #include "../matrix.h"
 #include "cudnn_dlibapi.h"
 #include "gpu_data.h"
+#include "../byte_orderer.h"
 #include <memory>

 namespace dlib
@@ -46,8 +47,10 @@ namespace dlib
         virtual const float* host() const = 0;
         virtual float* host() = 0;
+        virtual float* host_write_only() = 0;
         virtual const float* device() const = 0;
         virtual float* device() = 0;
+        virtual float* device_write_only() = 0;

         tensor& operator= (float val)
         {
@@ -62,8 +65,9 @@ namespace dlib
                 return *this;
             }
 #endif
-            for (auto& d : *this)
-                d = val;
+            auto d = host_write_only();
+            for (size_t i = 0; i < size(); ++i)
+                d[i] = val;
             return *this;
         }
@@ -95,7 +99,7 @@ namespace dlib
             static_assert((is_same_type<float, typename EXP::type>::value == true),
                 "To assign a matrix to a tensor the matrix must contain float values");
-            set_ptrm(host(), m_n, m_nr*m_nc*m_k) = item;
+            set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item;
             return *this;
         }
@@ -279,8 +283,10 @@ namespace dlib
         virtual const float* host() const { return data_instance.host(); }
         virtual float* host() { return data_instance.host(); }
+        virtual float* host_write_only() { return data_instance.host_write_only(); }
         virtual const float* device() const { return data_instance.device(); }
         virtual float* device() { return data_instance.device(); }
+        virtual float* device_write_only() { return data_instance.device_write_only(); }

         void clear(
         )
@@ -373,21 +379,33 @@ namespace dlib
     inline void serialize(const tensor& item, std::ostream& out)
     {
-        int version = 1;
+        int version = 2;
         serialize(version, out);
         serialize(item.num_samples(), out);
         serialize(item.k(), out);
         serialize(item.nr(), out);
         serialize(item.nc(), out);
-        for (auto& d : item)
-            serialize(d, out);
+        byte_orderer bo;
+        auto sbuf = out.rdbuf();
+        for (auto d : item)
+        {
+            // Write out our data as 4byte little endian IEEE floats rather than using
+            // dlib's default float serialization. We do this because it will result in
+            // more compact outputs. It's slightly less portable but it seems doubtful
+            // that any CUDA enabled platform isn't going to use IEEE floats. But if one
+            // does we can just update the serialization code here to handle it if such a
+            // platform is encountered.
+            bo.host_to_little(d);
+            static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
+            sbuf->sputn((char*)&d, sizeof(d));
+        }
     }

     inline void deserialize(resizable_tensor& item, std::istream& in)
     {
         int version;
         deserialize(version, in);
-        if (version != 1)
+        if (version != 2)
             throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");

         long num_samples=0, k=0, nr=0, nc=0;
@@ -396,8 +414,18 @@ namespace dlib
         deserialize(nr, in);
         deserialize(nc, in);
         item.set_size(num_samples, k, nr, nc);
+        byte_orderer bo;
+        auto sbuf = in.rdbuf();
         for (auto& d : item)
-            deserialize(d, in);
+        {
+            static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
+            if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d))
+            {
+                in.setstate(std::ios::badbit);
+                throw serialization_error("Error reading data while deserializing dlib::resizable_tensor.");
+            }
+            bo.little_to_host(d);
+        }
     }

 // ----------------------------------------------------------------------------------------
@@ -441,8 +469,10 @@ namespace dlib
         virtual const float* host() const { return data_instance->host()+data_offset; }
         virtual float* host() { return data_instance->host()+data_offset; }
+        virtual float* host_write_only() { return data_instance->host()+data_offset; }
         virtual const float* device() const { return data_instance->device()+data_offset; }
         virtual float* device() { return data_instance->device()+data_offset; }
+        virtual float* device_write_only() { return data_instance->device()+data_offset; }

 #ifdef DLIB_USE_CUDA
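Since the wire format changed (version 1 serialized each float through dlib's serializer; version 2 writes the header fields as before, then the elements as raw 4-byte little-endian IEEE floats), a round trip makes a quick sanity check. A minimal sketch:

    #include <dlib/dnn.h>
    #include <sstream>
    #include <cassert>

    int main()
    {
        dlib::resizable_tensor a(2, 3, 4, 5);

        // Fill through the new write-only accessor; the block starts undefined.
        float* p = a.host_write_only();
        for (size_t i = 0; i < a.size(); ++i)
            p[i] = 3.5f;

        // serialize() emits the version 2 format described above.
        std::ostringstream out;
        serialize(a, out);

        dlib::resizable_tensor b;
        std::istringstream in(out.str());
        deserialize(b, in);

        assert(b.size() == a.size());
        assert(b.host()[0] == 3.5f);
        return 0;
    }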
dlib/dnn/tensor_abstract.h
@@ -133,6 +133,18 @@ namespace dlib
               calling host().
         !*/

+        virtual float* host_write_only(
+        ) = 0;
+        /*!
+            ensures
+                - This function returns the same pointer as host(), except that it never
+                  performs a device to host memory copy. Instead, it immediately marks the
+                  device side data as out of date, effectively discarding it. Therefore,
+                  the values in the data pointed to by host_write_only() are undefined and
+                  you should only call host_write_only() if you are going to assign to
+                  every memory location in the returned memory block.
+        !*/
+
         virtual const float* device(
         ) const = 0;
         /*!
@@ -161,6 +173,20 @@ namespace dlib
               host() will perform a device to host transfer.
         !*/

+        virtual float* device_write_only(
+        ) = 0;
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - This function returns the same pointer as device(), except that it never
+                  performs a host to device memory copy. Instead, it immediately marks the
+                  host side data as out of date, effectively discarding it. Therefore, the
+                  values in the data pointed to by device_write_only() are undefined and
+                  you should only call device_write_only() if you are going to assign to
+                  every memory location in the returned memory block.
+        !*/
+
         tensor& operator= (
             float val
         );
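To make the host()/host_write_only() distinction concrete at the tensor level, a brief sketch following the contracts documented above (contrast is an illustrative helper, not dlib API):

    #include <dlib/dnn.h>

    void contrast(dlib::tensor& t)
    {
        // host(): read-modify-write. The returned values are guaranteed to be
        // current, copying from the device first if necessary.
        float* a = t.host();
        a[0] += 1;

        // host_write_only(): write-only. No device-to-host copy is performed
        // and the contents are undefined, so every element must be assigned.
        float* b = t.host_write_only();
        for (size_t i = 0; i < t.size(); ++i)
            b[i] = 0;
    }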