Commit ded490dc authored by Davis King

Optimized the matrix multiply a little more.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%402742
parent 7b53a1cb
...@@ -82,18 +82,25 @@ namespace dlib ...@@ -82,18 +82,25 @@ namespace dlib
!*/ !*/
{ {
using namespace ma; using namespace ma;
const matrix_exp<EXP1>& lhs = src.lhs; const EXP1& lhs = src.lhs;
const matrix_exp<EXP2>& rhs = src.rhs; const EXP2& rhs = src.rhs;
const long bs = 100; const long bs = 90;
set_all_elements(dest,0);
// if the matrices are small enough then just use the simple multiply algorithm // if the matrices are small enough then just use the simple multiply algorithm
if (lhs.nc() <= 2 || rhs.nc() <= 2 || lhs.nr() <= 2 || rhs.nr() <= 2 || (lhs.size() <= bs*10 && rhs.size() <= bs*10) ) if (lhs.nc() <= 2 || rhs.nc() <= 2 || lhs.nr() <= 2 || rhs.nr() <= 2 || (lhs.size() <= bs*10 && rhs.size() <= bs*10) )
{ {
for (long r = 0; r < src.nr(); ++r) // This loop is optimized assuming that the data is laid out in
// row major order in memory.
for (long r = 0; r< lhs.nr(); ++r)
{ {
for (long c = 0; c < src.nc(); ++c) for (long c = 0; c< lhs.nc(); ++c)
{ {
dest(r,c) = src(r,c); const typename EXP2::type temp = lhs(r,c);
for (long i = 0; i < rhs.nc(); ++i)
{
dest(r,i) += rhs(c,i)*temp;
}
} }
} }
} }
...@@ -102,6 +109,7 @@ namespace dlib ...@@ -102,6 +109,7 @@ namespace dlib
// if the lhs and rhs matrices are big enough we should use a cache friendly // if the lhs and rhs matrices are big enough we should use a cache friendly
// algorithm that computes the matrix multiply in blocks. // algorithm that computes the matrix multiply in blocks.
// Loop over all the blocks in the lhs matrix // Loop over all the blocks in the lhs matrix
for (long r = 0; r < lhs.nr(); r+=bs) for (long r = 0; r < lhs.nr(); r+=bs)
{ {
...@@ -118,10 +126,20 @@ namespace dlib ...@@ -118,10 +126,20 @@ namespace dlib
// make a target rect in res // make a target rect in res
rectangle res_block(rhs_block.left(),lhs_block.top(), rhs_block.right(), lhs_block.bottom()); rectangle res_block(rhs_block.left(),lhs_block.top(), rhs_block.right(), lhs_block.bottom());
if (c != 0)
set_subm(dest, res_block) = subm(dest,res_block) + subm(lhs,lhs_block)*subm(rhs, rhs_block); // This loop is optimized assuming that the data is laid out in
else // row major order in memory.
set_subm(dest, res_block) = null_exp(subm(lhs,lhs_block)*subm(rhs, rhs_block)); for (long r = lhs_block.top(); r <= lhs_block.bottom(); ++r)
{
for (long c = lhs_block.left(); c<= lhs_block.right(); ++c)
{
const typename EXP2::type temp = lhs(r,c);
for (long i = rhs_block.left(); i <= rhs_block.right(); ++i)
{
dest(r,i) += rhs(c,i)*temp;
}
}
}
} }
} }
} }
......
...@@ -36,8 +36,10 @@ namespace dlib ...@@ -36,8 +36,10 @@ namespace dlib
// inline behavior out of GCC. // inline behavior out of GCC.
#ifdef __GNUC__ #ifdef __GNUC__
#define DLIB_DONT_INLINE __attribute__((noinline)) #define DLIB_DONT_INLINE __attribute__((noinline))
#define DLIB_ALWAYS_INLINE __attribute__((always_inline))
#else #else
#define DLIB_DONT_INLINE #define DLIB_DONT_INLINE
#define DLIB_ALWAYS_INLINE
#endif #endif
template < template <
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment