Commit b09ddc3a authored by Davis King

Fixed a bug in memcpy() for tensors where you weren't allowed to copy
alias tensors. Now any kind of tensor is supported.
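
For context, the call pattern this commit makes legal is copying between alias_tensor views that share one underlying buffer. A minimal sketch (the main() scaffolding is illustrative, not part of the change; it mirrors the tests added at the bottom of this commit):

    #include <dlib/dnn.h>
    using namespace dlib;

    int main()
    {
        resizable_tensor A;
        A.set_size(11);
        A = 4;                // fill with 4s
        alias_tensor at(5);   // a 5-element window
        auto dst = at(A, 1);  // view of A's elements [1,6)
        auto src = at(A, 6);  // view of A's elements [6,11), same gpu_data
        memcpy(dst, src);     // previously unsupported; now copies just the viewed sub-range
    }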
parent fe42f662
@@ -24,20 +24,59 @@ namespace dlib
     )
     {
         DLIB_CASSERT(dest.size() == src.size(), "");
-        if (src.size() == 0)
+        if (src.size() == 0 || &dest == &src)
             return;
-        // copy the memory efficiently based on which copy is current in each object.
-        if (dest.device_ready() && src.device_ready())
-            CHECK_CUDA(cudaMemcpy(dest.device(), src.device(), src.size()*sizeof(float), cudaMemcpyDeviceToDevice));
-        else if (!dest.device_ready() && src.device_ready())
-            CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.device(), src.size()*sizeof(float), cudaMemcpyDeviceToHost));
-        else if (dest.device_ready() && !src.device_ready())
-            CHECK_CUDA(cudaMemcpy(dest.device(), src.host(), src.size()*sizeof(float), cudaMemcpyHostToDevice));
-        else
-            CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.host(), src.size()*sizeof(float), cudaMemcpyHostToHost));
+        memcpy(dest, 0, src, 0, src.size());
     }
+
+    void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    )
+    {
+        DLIB_CASSERT(dest_offset + num <= dest.size(), "");
+        DLIB_CASSERT(src_offset + num <= src.size(), "");
+        if (num == 0)
+            return;
+
+        // if there is aliasing
+        if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset, src_offset)+num)
+        {
+            // if they perfectly alias each other then there is nothing to do
+            if (dest_offset == src_offset)
+                return;
+            else
+                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+        }
+        else
+        {
+            // if we write to the entire thing then we can use device_write_only()
+            if (dest_offset == 0 && num == dest.size())
+            {
+                // copy the memory efficiently based on which copy is current in each object.
+                if (src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
+                else
+                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
+            }
+            else
+            {
+                // copy the memory efficiently based on which copy is current in each object.
+                if (dest.device_ready() && src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
+                else if (!dest.device_ready() && src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
+                else if (dest.device_ready() && !src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
+                else
+                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
+            }
+        }
+    }
 
 // ----------------------------------------------------------------------------------------
 
     void gpu_data::
......
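
A note on the aliasing test in the new overload above: two equal-length ranges [d, d+num) and [s, s+num) overlap exactly when max(d,s) < min(d,s)+num, and that predicate decides whether the copy must be routed through std::memmove on the host. A standalone sketch of the check (names are illustrative):

    #include <algorithm>
    #include <cstddef>

    // True when [d, d+num) and [s, s+num) share at least one index.
    bool ranges_overlap(std::size_t d, std::size_t s, std::size_t num)
    {
        return std::max(d, s) < std::min(d, s) + num;
    }

    // ranges_overlap(1, 6, 5) == false   -- [1,6) and [6,11) are adjacent
    // ranges_overlap(1, 5, 5) == true    -- [1,6) and [5,10) share index 5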
@@ -208,14 +208,54 @@ namespace dlib
 #ifdef DLIB_USE_CUDA
     void memcpy (gpu_data& dest, const gpu_data& src);
+    void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    );
 #else
     inline void memcpy (gpu_data& dest, const gpu_data& src)
     {
         DLIB_CASSERT(dest.size() == src.size(), "");
-        if (src.size() == 0)
+        if (src.size() == 0 || &dest == &src)
             return;
         std::memcpy(dest.host_write_only(), src.host(), sizeof(float)*src.size());
     }
+    inline void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    )
+    {
+        DLIB_CASSERT(dest_offset + num <= dest.size(), "");
+        DLIB_CASSERT(src_offset + num <= src.size(), "");
+        if (num == 0)
+            return;
+
+        if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset, src_offset)+num)
+        {
+            // if they perfectly alias each other then there is nothing to do
+            if (dest_offset == src_offset)
+                return;
+            else
+                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+        }
+        else
+        {
+            // if we write to the entire thing then we can use host_write_only()
+            if (dest_offset == 0 && num == dest.size())
+                std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num);
+            else
+                std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+        }
+    }
 #endif
 
 // ----------------------------------------------------------------------------------------
......
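
The dest_offset == 0 && num == dest.size() special case in both builds exists because the *_write_only() accessors let gpu_data skip synchronizing the destination's stale contents when they are about to be overwritten in full; a partial copy must go through host()/device() so that bytes outside the copied range keep their current values. Side by side, using the same calls as the code above (the rationale in the comments is my reading of gpu_data's behavior, not stated in this commit):

    // Full overwrite: dest's old contents are never read, so gpu_data
    // need not transfer them to the host before the copy.
    std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num);

    // Partial overwrite: bytes outside [dest_offset, dest_offset+num)
    // survive, so dest.host() must hold the current values first.
    std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);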
@@ -233,6 +233,31 @@ namespace dlib
             - This function blocks until the copy has completed.
     !*/
 
+    void memcpy (
+        gpu_data& dest,
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    );
+    /*!
+        requires
+            - dest_offset + num <= dest.size()
+            - src_offset + num <= src.size()
+        ensures
+            - Copies the data in src to dest, but only copies data in the range
+              [src.host()+src_offset, src.host()+src_offset+num) to
+              [dest.host()+dest_offset, dest.host()+dest_offset+num).  Therefore,
+              it is just like the above memcpy() except that you can specify some
+              subset of the data in a gpu_data object to be copied.
+            - Like the above version of memcpy(), the copy will happen in the most
+              efficient way, automatically using the appropriate type of host/device
+              transfers based on where data is currently resident.
+            - It doesn't matter what GPU device is selected by cudaSetDevice().  You
+              can always copy gpu_data objects to and from each other regardless.
+            - This function blocks until the copy has completed.
+    !*/
 
 // ----------------------------------------------------------------------------------------
 
 }
......
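
A small usage sketch of the offset overload specified above, with CPU-resident data (the concrete values are illustrative only):

    #include <dlib/dnn.h>

    int main()
    {
        dlib::gpu_data src, dest;
        src.set_size(10);
        dest.set_size(10);
        for (size_t i = 0; i < src.size(); ++i)
            src.host()[i] = (float)i;      // src  = 0,1,2,...,9
        for (size_t i = 0; i < dest.size(); ++i)
            dest.host()[i] = -1;           // dest = -1,-1,...,-1
        dlib::memcpy(dest, 2, src, 0, 5);  // dest = -1,-1,0,1,2,3,4,-1,-1,-1
    }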
@@ -169,7 +169,10 @@ namespace dlib
         const tensor& src
     )
     {
-        memcpy(dest.data(), src.data());
+        DLIB_CASSERT(dest.size() == src.size(), "");
+        memcpy(dest.data(), dest.get_alias_offset(),
+               src.data(),  src.get_alias_offset(),
+               src.size());
     }
......
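
This change is the heart of the fix: the tensor-level memcpy() now forwards each tensor's get_alias_offset(), i.e. where its elements begin inside the underlying gpu_data buffer (0 for an ordinary tensor, nonzero for alias_tensor views), so copies into or out of a view land on the correct sub-range. A brief sketch (values illustrative):

    #include <dlib/dnn.h>
    using namespace dlib;

    int main()
    {
        resizable_tensor A, B;
        A.set_size(11);
        B.set_size(5);
        A = 4;
        B = 1;
        alias_tensor at(5);
        auto v = at(A, 6);  // view of A's elements [6,11)
        memcpy(v, B);       // writes B's 5 ones into just that slice of A
    }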
@@ -497,6 +497,103 @@ namespace
 #endif
         }
 
+        {
+            resizable_tensor A, B;
+            A.set_size(11);
+            B.copy_size(A);
+
+            A = 4;
+            B = 1;
+            matrix<float> truth;
+
+            alias_tensor at(5);
+
+            A = 4;
+            A.host();
+            B.host();
+            {
+                // non-aliasing test
+                auto aA = at(A, 5);
+                auto aB = at(B, 5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A, 1);
+                auto aB = at(A, 6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+
+#ifdef DLIB_USE_CUDA
+            A = 4;
+            A.device();
+            B.host();
+            {
+                // non-aliasing test
+                auto aA = at(A, 5);
+                auto aB = at(B, 5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A, 1);
+                auto aB = at(A, 6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+
+            A = 4;
+            A.device();
+            B.device();
+            {
+                // non-aliasing test
+                auto aA = at(A, 5);
+                auto aB = at(B, 5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A, 1);
+                auto aB = at(A, 6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+
+            A = 4;
+            A.host();
+            B.device();
+            {
+                // non-aliasing test
+                auto aA = at(A, 5);
+                auto aB = at(B, 5);
+                memcpy(aA, aB);
+                truth = {4,4,4,4,4, 1,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+            {
+                // aliasing test
+                auto aA = at(A, 1);
+                auto aB = at(A, 6);
+                memcpy(aA, aB);
+                truth = {4,1,1,1,1, 4,1,1,1,1, 4};
+                DLIB_TEST(max(abs(mat(A) - truth)) < 1e-5);
+            }
+#endif
+        }
 
         {
             resizable_tensor A, B;
             A.set_size(2,3,4,5);
......