Commit c3a74c7c authored by Davis King's avatar Davis King

Added affine_transform_range() and another overload of affine_transform()

parent 15b2d7b5
......@@ -385,6 +385,30 @@ namespace dlib
d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
}
// Computes dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for every index i in the
// half open range [begin,end), operating on the buffers returned by host().
// Only indices inside that range are written.
void affine_transform_range(
    size_t begin,
    size_t end,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    const float A,
    const float B,
    const float C
)
{
    // Every tensor must hold the same number of elements, and the requested
    // range has to lie inside them.
    DLIB_CASSERT(dest.size()==src1.size(),"");
    DLIB_CASSERT(dest.size()==src2.size(),"");
    DLIB_CASSERT(dest.size()==src3.size(),"");
    DLIB_CASSERT(begin <= end && end <= dest.size(),"");

    const auto out = dest.host();
    const auto in1 = src1.host();
    const auto in2 = src2.host();
    const auto in3 = src3.host();

    size_t idx = begin;
    while (idx < end)
    {
        out[idx] = A*in1[idx] + B*in2[idx] + C*in3[idx];
        ++idx;
    }
}
// -----------------------------------------------------------------------------------
void affine_transform(
......
......@@ -81,6 +81,18 @@ namespace dlib
const float D
);
// Range-limited affine combination.  For each i in the half open range [begin,end)
// this sets dest[i] = A*src1[i] + B*src2[i] + C*src3[i].
// Requires dest, src1, src2, and src3 to all have the same size() and
// begin <= end <= dest.size().
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// -----------------------------------------------------------------------------------
void affine_transform(
......
......@@ -504,6 +504,40 @@ namespace dlib
src2.device(), src3.device(), dest.size(), A, B, C, D);
}
// ----------------------------------------------------------------------------------------
// Device kernel: writes d[i] = A*s1[i] + B*s2[i] + C*s3[i] for each index i that
// grid_stride_range(begin, end) yields to the calling thread.
__global__ void _cuda_affine_transform_range(
    float* d,
    const float* s1,
    const float* s2,
    const float* s3,
    size_t begin,
    size_t end,
    float A,
    float B,
    float C
)
{
    for (auto idx : grid_stride_range(begin, end))
        d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx];
}
// Host-side entry point that launches _cuda_affine_transform_range so that, for
// each i in the half open range [begin,end), the device buffer of dest receives
// A*src1[i] + B*src2[i] + C*src3[i].
//
// requires
//     - dest, src1, src2, and src3 all have the same size()
//     - begin <= end <= dest.size()
void affine_transform_range(
    size_t begin,
    size_t end,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    const float A,
    const float B,
    const float C
)
{
    DLIB_CASSERT(dest.size()==src1.size(),"");
    DLIB_CASSERT(dest.size()==src2.size(),"");
    DLIB_CASSERT(dest.size()==src3.size(),"");
    DLIB_CASSERT(begin <= end && end <= dest.size(),"");

    // An empty range means there is no element to write.  Return before asking
    // for a launch sized for zero jobs, so this path is a no-op just like a
    // plain loop over [begin,end) would be.
    if (begin == end)
        return;

    launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),
        dest.device(), src1.device(),
        src2.device(), src3.device(), begin, end, A, B, C);
}
// -----------------------------------------------------------------------------------
__global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
......
......@@ -164,6 +164,18 @@ namespace dlib
const float D
);
// Range-limited affine combination.  For each i in the half open range [begin,end)
// this sets dest[i] = A*src1[i] + B*src2[i] + C*src3[i].
// Requires dest, src1, src2, and src3 to all have the same size() and
// begin <= end <= dest.size().
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// Note that this function isn't in the tt:: namespace because add_scaled() is
// called by cuda::add() so we don't need a tt:: version of add_scaled().
void add_scaled(
......
......@@ -240,6 +240,42 @@ namespace dlib { namespace tt
#endif
}
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
#ifdef DLIB_USE_CUDA
cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
#else
cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
#endif
}
// Computes #dest == A*src1 + B*src2 + C*src3 over the entire tensor.
void affine_transform(
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    const float A,
    const float B,
    const float C
)
{
    // The whole-tensor case is the range version applied to [0, dest.size()).
    // tt::affine_transform_range performs the cuda/cpu backend selection.
    tt::affine_transform_range(0, dest.size(), dest, src1, src2, src3, A, B, C);
}
// ----------------------------------------------------------------------------------------
void affine_transform(
......
......@@ -229,13 +229,58 @@ namespace dlib { namespace tt
const float D
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3 + D
!*/
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3
- This is the same as calling
affine_transform_range(0, dest.size(), dest, src1, src2, src3, A, B, C).
!*/
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- begin <= end <= dest.size()
ensures
- This function operates much like
affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
the half open range [begin,end) rather than processing the entire tensor.
Specifically, it does this:
- for i in the range [begin, end):
- #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
- Elements of dest outside the range [begin,end) are not written.
- If begin == end then no elements are written.
!*/
// ----------------------------------------------------------------------------------------
void affine_transform(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment