Commit 89c9267e authored by Davis King

Made copy_tensor() use cudaMemcpyAsync() rather than cudaMemcpy().

parent aafa4116
@@ -1401,7 +1401,7 @@ namespace dlib
         for (long i = 0; i < src.num_samples(); ++i)
         {
-            CHECK_CUDA(cudaMemcpy(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
+            CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
             dest_p += dest_sample_size;
             src_p += src_sample_size;
...
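The same strided, per-sample device-to-device copy pattern can be exercised outside of dlib. Below is a minimal standalone sketch, not dlib's actual copy_tensor(): the buffer sizes and the local error-checking macro (standing in for dlib's CHECK_CUDA) are illustrative. It issues cudaMemcpyAsync() on the default stream, as the changed code above does, and synchronizes before the buffers are reused.

    // Standalone sketch of a strided device-to-device copy with cudaMemcpyAsync().
    // Compile with: nvcc async_copy_sketch.cu -o async_copy_sketch
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Local stand-in for dlib's CHECK_CUDA macro.
    #define CHECK_CUDA_CALL(call)                                               \
        do {                                                                    \
            cudaError_t err__ = (call);                                         \
            if (err__ != cudaSuccess) {                                         \
                std::fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                             cudaGetErrorString(err__), __FILE__, __LINE__);    \
                std::exit(1);                                                   \
            }                                                                   \
        } while (0)

    int main()
    {
        // Illustrative sizes: each sample occupies a different stride in src and dest.
        const long num_samples      = 4;
        const long src_sample_size  = 6;   // floats per sample in src
        const long dest_sample_size = 8;   // floats per sample in dest
        const long block_size       = 6;   // floats copied per sample

        float* src_mem  = nullptr;
        float* dest_mem = nullptr;
        CHECK_CUDA_CALL(cudaMalloc(&src_mem,  num_samples * src_sample_size  * sizeof(float)));
        CHECK_CUDA_CALL(cudaMalloc(&dest_mem, num_samples * dest_sample_size * sizeof(float)));

        const float* src_p  = src_mem;
        float*       dest_p = dest_mem;
        for (long i = 0; i < num_samples; ++i)
        {
            // Async copy on the default stream: the call returns immediately and the
            // copies execute in order with other default-stream work on the device.
            CHECK_CUDA_CALL(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float),
                                            cudaMemcpyDeviceToDevice));
            dest_p += dest_sample_size;
            src_p  += src_sample_size;
        }

        // Block until all queued copies have finished before freeing the buffers.
        CHECK_CUDA_CALL(cudaDeviceSynchronize());

        CHECK_CUDA_CALL(cudaFree(src_mem));
        CHECK_CUDA_CALL(cudaFree(dest_mem));
        return 0;
    }

Because the copies are device-to-device and issued on the default stream, queuing them asynchronously avoids a host synchronization per sample while preserving ordering with respect to other default-stream kernels.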
@@ -1558,6 +1558,7 @@ namespace dlib { namespace tt
             - dest.k() - dest_k_offset >= count_k
             - src.k() - src_k_offset >= count_k
             - is_same_object(dest,src) == false
+            - The memory areas of src and dest do not overlap.
         ensures
             - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
               Copies content of each sample from src in to corresponding place of sample at dest.
...
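For context, here is a hedged usage sketch of tt::copy_tensor() under the requirements listed in the spec above. The parameter order (dest, dest_k_offset, src, src_k_offset, count_k) is assumed from the names in that spec and should be checked against the declaration in tensor_tools.h.

    // Hypothetical usage sketch of dlib::tt::copy_tensor(); signature assumed from the spec.
    #include <dlib/dnn.h>

    int main()
    {
        // Two distinct tensors, so their memory areas do not overlap as now required.
        dlib::resizable_tensor src(2, 4, 3, 3);   // 2 samples, 4 channels, 3x3
        dlib::resizable_tensor dest(2, 6, 3, 3);  // 2 samples, 6 channels, 3x3
        src = 1;                                  // fill src with ones

        // Copy 2 channels starting at src channel 1 into dest starting at channel 2,
        // i.e. dest[i, 2+k, r, c] = src[i, 1+k, r, c] for each copied channel k.
        dlib::tt::copy_tensor(dest, 2, src, 1, 2);
        return 0;
    }

The added non-overlap requirement matches the semantics of cudaMemcpyAsync(), whose behavior is undefined when source and destination regions overlap.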