Commit 3586d409 authored by Davis King

Added tt::scale_columns(). Also optimized some other cuda kernel launches a little.
parent 2a2956a2
@@ -142,7 +142,9 @@ namespace dlib
         )
         {
             invnorms.set_size(data.num_samples());
-            launch_kernel(_cuda_inverse_norms,max_jobs(data.size()), invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+            dim3 blocks(10,1);
+            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+            _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
         }
 
     // ----------------------------------------------------------------------------------------
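The comment on the new launch geometry is the key detail here: warp_reduce_atomic_add() combines partial sums with warp shuffle instructions, which only operate across the 32 lanes of a single warp, so blockDim.x must equal the warp size for the reduction to fold a complete row of threads. A minimal standalone sketch of that pattern (an illustration, not dlib's actual helper, whose implementation may differ):

// Sketch of a warp-level reduce-then-atomic-add, assuming blockDim.x == 32
// so that threadIdx.x is the lane id within the warp.
__device__ void warp_reduce_atomic_add_sketch(float* dest, float val)
{
    // Fold the warp's 32 partial sums down to lane 0 with shuffles.
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    // One atomicAdd per warp instead of 32 per-thread atomics.
    if (threadIdx.x == 0)
        atomicAdd(dest, val);
}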
@@ -174,7 +176,28 @@ namespace dlib
         )
         {
             out.set_size(lhs.num_samples());
-            launch_kernel(_cuda_dot_prods, max_jobs(lhs.size()), out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
+            dim3 blocks(10,1);
+            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+            _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
         }
 
     // ----------------------------------------------------------------------------------------
 
+        __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc)
+        {
+            for (auto j : grid_stride_range(0, nr*nc))
+            {
+                out[j] = m[j]*v[j%nc];
+            }
+        }
+
+        void scale_columns (
+            tensor& out,
+            const tensor& m,
+            const tensor& v
+        )
+        {
+            launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
+        }
+
+    // ----------------------------------------------------------------------------------------
......
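The new _cuda_scale_columns kernel above relies on dlib's grid_stride_range() helper. For readers unfamiliar with it, the loop is equivalent to the standard CUDA grid-stride idiom sketched below (a plain-CUDA rendering, not dlib's code), which is what lets launch_kernel() pick a fixed launch size and still cover an array of any length:

// Plain-CUDA equivalent of the grid_stride_range(0, nr*nc) loop: each
// thread starts at its global index and strides by the total number of
// threads in the grid until all nr*nc elements are covered.
__global__ void scale_columns_sketch(float* out, const float* m,
                                     const float* v, size_t nr, size_t nc)
{
    for (size_t j = blockIdx.x*blockDim.x + threadIdx.x;
         j < nr*nc;
         j += (size_t)gridDim.x*blockDim.x)
    {
        out[j] = m[j]*v[j%nc];  // element j lives in column j%nc
    }
}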
@@ -120,6 +120,12 @@ namespace dlib
             const tensor& rhs
         );
 
+        void scale_columns (
+            tensor& out,
+            const tensor& m,
+            const tensor& v
+        );
+
         void scale_rows (
             tensor& out,
             const tensor& m,
......
@@ -69,6 +69,27 @@ namespace dlib { namespace tt
 #endif
     }
 
+    void scale_columns (
+        tensor& out,
+        const tensor& m,
+        const tensor& v
+    )
+    {
+        DLIB_CASSERT(have_same_dimensions(out,m));
+        DLIB_CASSERT(is_vector(v));
+        if (m.size() == 0 && v.size() == 0)
+            return;
+        DLIB_CASSERT(m.size() != 0);
+        DLIB_CASSERT(m.size()/m.num_samples() == v.size());
+
+#ifdef DLIB_USE_CUDA
+        cuda::scale_columns(out, m, v);
+#else
+        DLIB_CASSERT(false, "shouldn't be called right now");
+        out = scale_columns(mat(m), mat(v));
+#endif
+    }
+
     void scale_rows (
         tensor& out,
         const tensor& m,
@@ -76,6 +97,11 @@ namespace dlib { namespace tt
     )
     {
         DLIB_CASSERT(have_same_dimensions(out,m));
+        DLIB_CASSERT(is_vector(v));
+        if (m.size() == 0 && v.size() == 0)
+            return;
+        DLIB_CASSERT(m.size() != 0);
+        DLIB_CASSERT(m.num_samples() == v.size());
 #ifdef DLIB_USE_CUDA
         cuda::scale_rows(out, m, v);
......
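Both wrappers fall back to the long-standing dlib::matrix versions of scale_columns()/scale_rows() on the CPU path (though the scale_columns branch additionally asserts for now, per the "shouldn't be called right now" note), so the intended semantics can be checked with plain matrix objects. A small worked example with illustrative values:

#include <dlib/matrix.h>
using namespace dlib;

int main()
{
    matrix<float> m(2,3);
    m = 1, 2, 3,
        4, 5, 6;
    matrix<float,1,3> v;
    v = 10, 20, 30;

    // Column j of m is multiplied by v(j):
    // out == [10 40 90; 40 100 180]
    matrix<float> out = scale_columns(m, v);
}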
@@ -48,6 +48,20 @@ namespace dlib { namespace tt
             - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
     !*/
 
+    void scale_columns (
+        tensor& out,
+        const tensor& m,
+        const tensor& v
+    );
+    /*!
+        requires
+            - have_same_dimensions(out,m) == true
+            - is_vector(v) == true
+            - v.size() == mat(m).nc()
+        ensures
+            - performs: out = scale_columns(mat(m),mat(v));
+    !*/
+
     void scale_rows (
         tensor& out,
         const tensor& m,
@@ -56,7 +70,7 @@ namespace dlib { namespace tt
     /*!
         requires
             - have_same_dimensions(out,m) == true
-            - is_vector(mat(v)) == true
+            - is_vector(v) == true
             - v.size() == m.num_samples()
         ensures
            - performs: out = scale_rows(mat(m),mat(v));
@@ -75,7 +89,7 @@ namespace dlib { namespace tt
            - have_same_dimensions(out,m1) == true
            - have_same_dimensions(out,m2) == true
            - have_same_dimensions(v1,v2) == true
-           - is_vector(mat(v1)) == true
+           - is_vector(v1) == true
            - v1.size() == m1.num_samples()
        ensures
            - performs:
......
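Putting the spec together, a hedged usage sketch of the new routine (note that in this commit the non-CUDA branch of tt::scale_columns() deliberately asserts, so this assumes a DLIB_USE_CUDA build; the tensor sizes and fill values are illustrative only):

#include <dlib/dnn.h>
using namespace dlib;

int main()
{
    resizable_tensor m(2,3);   // mat(m) is 2 rows x 3 columns
    resizable_tensor v(1,3);   // one scale factor per column of mat(m)
    resizable_tensor out;
    out.copy_size(m);          // satisfies have_same_dimensions(out,m)

    m = 1;                     // set every element of m to 1
    v = 2;                     // scale every column by 2

    // out == scale_columns(mat(m), mat(v)), i.e. all elements become 2.
    tt::scale_columns(out, m, v);
}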