Commit cc5a62cd authored by Davis King

Made affine_transform() routines a little faster.

parent 919cbd11
...@@ -237,6 +237,14 @@ namespace dlib ...@@ -237,6 +237,14 @@ namespace dlib
} }
} }
// Scale-only kernel: stores A*s[idx] into d[idx] for every index idx yielded
// by grid_stride_range(0, n).
//
// affine_transform() launches this variant instead of _cuda_affine_transform1
// when its additive term B equals zero, which is why no B argument is taken.
//
//   d - output buffer, written at each visited index
//   s - input buffer, read at each visited index
//   n - upper bound handed to grid_stride_range
//   A - multiplier applied to every input element
__global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A)
{
    for (const auto idx : grid_stride_range(0, n))
        d[idx] = A*s[idx];
}
void affine_transform( void affine_transform(
tensor& dest, tensor& dest,
const tensor& src, const tensor& src,
...@@ -245,7 +253,10 @@ namespace dlib ...@@ -245,7 +253,10 @@ namespace dlib
) )
{ {
DLIB_CASSERT(dest.size()==src.size(),""); DLIB_CASSERT(dest.size()==src.size(),"");
launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B); if (B != 0)
launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B);
else
launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
} }
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -258,6 +269,14 @@ namespace dlib ...@@ -258,6 +269,14 @@ namespace dlib
} }
} }
// Two-input kernel without a constant term: stores A*s1[idx] + B*s2[idx] into
// d[idx] for every index idx yielded by grid_stride_range(0, n).
//
// affine_transform() launches this variant instead of _cuda_affine_transform4
// when its constant term C equals zero, which is why no C argument is taken.
//
//   d      - output buffer, written at each visited index
//   s1, s2 - input buffers, read at each visited index
//   n      - upper bound handed to grid_stride_range
//   A, B   - multipliers applied to s1 and s2 respectively
__global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B)
{
    // Kept as a single expression so the arithmetic is evaluated exactly as before.
    for (const auto idx : grid_stride_range(0, n))
        d[idx] = A*s1[idx] + B*s2[idx];
}
void affine_transform( void affine_transform(
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
...@@ -269,7 +288,10 @@ namespace dlib ...@@ -269,7 +288,10 @@ namespace dlib
{ {
DLIB_CASSERT(dest.size()==src1.size(),""); DLIB_CASSERT(dest.size()==src1.size(),"");
DLIB_CASSERT(dest.size()==src2.size(),""); DLIB_CASSERT(dest.size()==src2.size(),"");
launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C); if (C != 0)
launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C);
else
launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B);
} }
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment