Optimized the matrix multiply a little more.

--HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%402742

Optimized the matrix multiply a little more.
--HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%402742
ded490dc · Davis King · 7b53a1cb · ded490dc · ded490dc
Commit ded490dc authored Dec 19, 2008 by Davis King
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 10 deletions

matrix_assign.h dlib/matrix/matrix_assign.h +28 -10

matrix_assign_fwd.h dlib/matrix/matrix_assign_fwd.h +2 -0

No files found.
--- a/dlib/matrix/matrix_assign.h
+++ b/dlib/matrix/matrix_assign.h
@@ -82,18 +82,25 @@ namespace dlib
    !*/
    {
        using namespace ma;
-        const matrix_exp<EXP1>& lhs = src.lhs;
-        const matrix_exp<EXP2>& rhs = src.rhs;
-        const long bs = 100;
+        const EXP1& lhs = src.lhs;
+        const EXP2& rhs = src.rhs;
+        const long bs = 90;
+        set_all_elements(dest,0);

        // if the matrices are small enough then just use the simple multiply algorithm
        if (lhs.nc() <= 2 || rhs.nc() <= 2 || lhs.nr() <= 2 || rhs.nr() <= 2 || (lhs.size() <= bs*10 && rhs.size() <= bs*10) )
        {
-            for (long r = 0; r < src.nr(); ++r)
+            // Loop order is (r, c, i): the innermost loop walks along one row of rhs and dest,
+            // so it touches consecutive elements when the data is laid out in row major order.
+            for (long r = 0; r< lhs.nr(); ++r)
            {
-                for (long c = 0; c < src.nc(); ++c)
+                for (long c = 0; c< lhs.nc(); ++c)
                {
-                    dest(r,c) = src(r,c);
+                    const typename EXP2::type temp = lhs(r,c);
+                    for (long i = 0; i < rhs.nc(); ++i)
+                    {
+                        dest(r,i) += rhs(c,i)*temp;
+                    }
                }
            }
        }
@@ -102,6 +109,7 @@ namespace dlib
            // if the lhs and rhs matrices are big enough we should use a cache friendly
            // algorithm that computes the matrix multiply in blocks.  

+
            // Loop over all the blocks in the lhs matrix
            for (long r = 0; r < lhs.nr(); r+=bs)
            {
@@ -118,10 +126,20 @@ namespace dlib

                        // make a target rect in res
                        rectangle res_block(rhs_block.left(),lhs_block.top(), rhs_block.right(), lhs_block.bottom());
-                        if (c != 0)
-                            set_subm(dest, res_block) = subm(dest,res_block) + subm(lhs,lhs_block)*subm(rhs, rhs_block);
-                        else
-                            set_subm(dest, res_block) = null_exp(subm(lhs,lhs_block)*subm(rhs, rhs_block));
+
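+                        // Loop order is (r, c, i): the innermost loop walks along one row of the rhs
+                        // and dest blocks, so it touches consecutive elements when the data is laid
+                        // out in row major order in memory.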
+                        for (long r = lhs_block.top(); r <= lhs_block.bottom(); ++r)
+                        {
+                            for (long c = lhs_block.left(); c<= lhs_block.right(); ++c)
+                            {
+                                const typename EXP2::type temp = lhs(r,c);
+                                for (long i = rhs_block.left(); i <= rhs_block.right(); ++i)
+                                {
+                                    dest(r,i) += rhs(c,i)*temp;
+                                }
+                            }
+                        }
                    }
                }
            }

--- a/dlib/matrix/matrix_assign_fwd.h
+++ b/dlib/matrix/matrix_assign_fwd.h
@@ -36,8 +36,10 @@ namespace dlib
 // inline behavior out of GCC.
 #ifdef __GNUC__
 #define DLIB_DONT_INLINE __attribute__((noinline))
+#define DLIB_ALWAYS_INLINE __attribute__((always_inline))
 #else
 #define DLIB_DONT_INLINE 
+#define DLIB_ALWAYS_INLINE 
 #endif

    template <