Commit 546bdf51 authored by Davis King

Switched 2D kernels to use the new 2D launch_kernel(). Also added an overload of dot_prods() that can accumulate in addition to assign.
parent 19b16e1a
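
In short, dot_prods() computes one dot product per sample (row) of lhs and rhs. A minimal sketch of the new accumulate-or-assign overload, mirroring the test added at the end of this commit (the setup here, including the tensor_rand filler, is illustrative and assumes dlib's dnn headers and the dlib::tt namespace):

    // Sketch only: two tensors with 123 samples of 432 values each.
    resizable_tensor a(123,432), b(123,432), out;
    tensor_rand rnd;
    rnd.fill_gaussian(a);
    rnd.fill_gaussian(b);

    dot_prods(out, a, b);          // existing overload: out(i) = dot(row i of a, row i of b)
    dot_prods(false, out, a, b);   // new overload, add_to == false: plain assignment, same result
    dot_prods(true, out, a, b);    // add_to == true: accumulates, so out now holds 2x each dot product
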
@@ -142,9 +142,8 @@ namespace dlib
         )
         {
             invnorms.set_size(data.num_samples());
-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+            launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()),
+                invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
         }

     // ----------------------------------------------------------------------------------------
@@ -170,16 +169,57 @@ namespace dlib
             }
         }

+        __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+        {
+            for (auto i : grid_stride_range_y(0, nr))
+            {
+                auto l = lhs + i*nc;
+                auto r = rhs + i*nc;
+                float temp = 0;
+                for (auto j : grid_stride_range(0, nc))
+                    temp += l[j]*r[j];
+
+                // and store the sum into out[i]
+                warp_reduce_atomic_add(out[i], temp);
+            }
+        }
+
         void dot_prods (
             resizable_tensor& out,
             const tensor& lhs,
             const tensor& rhs
         )
         {
             DLIB_CASSERT(have_same_dimensions(lhs,rhs));
             out.set_size(lhs.num_samples());
-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
+
+            if (out.size() == 0)
+                return;
+
+            const auto nr = lhs.num_samples();
+            const auto nc = lhs.size()/lhs.num_samples();
+
+            launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
         }
+
+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        )
+        {
+            DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+            DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1);
+            DLIB_CASSERT(out.size() == lhs.num_samples());
+
+            const auto nr = lhs.num_samples();
+            const auto nc = lhs.size()/lhs.num_samples();
+            if (add_to)
+                launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc);
+            else
+                launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+        }

     // ----------------------------------------------------------------------------------------
@@ -501,14 +541,15 @@ namespace dlib
                 if (dest.size() == 0)
                     return;

-                dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-                dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+                const auto bs = src1.nr()*src1.nc();
+                const auto n = src1.num_samples()*src1.k();
                 if (add_to)
-                    _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
-                        dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                    launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n),
+                        dest.device(), src1.device(), n, src2.device(), bs, src1.k());
                 else
-                    _cuda_multiply_conv2<<<blocks,threads>>>(
-                        dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                    launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n),
+                        dest.device(), src1.device(), n, src2.device(), bs, src1.k());
             }
         }
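
For readers unfamiliar with the helpers used above: the kernels walk rows via grid_stride_range_y() and columns via grid_stride_range(), combine each row's partial sums with warp_reduce_atomic_add(), and are launched through launch_kernel() with a max_jobs(columns, rows) size hint instead of a hand-picked dim3 grid. A hedged sketch of that pattern follows; the _cuda_row_sums kernel and the pointer names are hypothetical, only the helper functions come from this diff:

    // Hypothetical kernel written against the same 2D launch pattern (assumes out[]
    // is zero-initialized, since warp_reduce_atomic_add() accumulates into it).
    __global__ void _cuda_row_sums(float* out, const float* m, size_t nr, size_t nc)
    {
        for (auto i : grid_stride_range_y(0, nr))       // this thread's share of the rows
        {
            float temp = 0;
            for (auto j : grid_stride_range(0, nc))     // threads in x split the columns of row i
                temp += m[i*nc + j];

            // combine the warp's partial sums and add the result into out[i]
            warp_reduce_atomic_add(out[i], temp);
        }
    }

    // Launch: max_jobs(nc, nr) tells launch_kernel() how much x/y work there is, e.g.
    // launch_kernel(_cuda_row_sums, max_jobs(nc, nr), out_ptr, m_ptr, nr, nc);
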
@@ -121,6 +121,13 @@ namespace dlib
             const tensor& rhs
         );

+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        );
+
         void scale_columns (
             tensor& out,
             const tensor& m,
@@ -69,6 +69,23 @@ namespace dlib { namespace tt
 #endif
     }

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::dot_prods(add_to, out, lhs, rhs);
+#else
+        if (add_to)
+            out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        else
+            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+#endif
+    }
+
     void scale_columns (
         tensor& out,
         const tensor& m,
@@ -50,6 +50,24 @@ namespace dlib { namespace tt
             - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
     !*/

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    );
+    /*!
+        requires
+            - have_same_dimensions(lhs,rhs) == true
+            - out.size() == lhs.num_samples()
+            - out.k() == out.nr() == out.nc() == 1
+        ensures
+            - if (add_to) then
+                - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+            - else
+                - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+    !*/
+
     void scale_columns (
         tensor& out,
         const tensor& m,
@@ -1256,6 +1256,24 @@ namespace
             out2 = scale_rows(mat(data), mat(invnorms1));
             DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
         }
+        {
+            resizable_tensor a(123,432), b(123,432);
+            rnd.fill_gaussian(a);
+            rnd.fill_gaussian(b);
+
+            resizable_tensor out;
+            dot_prods(out, a,b);
+            const matrix<float> truth = sum_cols(pointwise_multiply(mat(a), mat(b)));
+            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
+
+            out = 0;
+            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
+            dot_prods(false, out, a,b);
+            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
+            dot_prods(true, out, a,b);
+            DLIB_TEST(max(abs(mat(out)/2 - truth)) < 1e-4);
+            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
+        }
     }

 // ----------------------------------------------------------------------------------------