Commit 59892409 authored by Fm's avatar Fm

depth layer: moved the CUDA and CPU concat/split implementations out of the dispatch layer and into the cpu/cuda source files

parent 28c4a482
...@@ -1741,6 +1741,57 @@ namespace dlib ...@@ -1741,6 +1741,57 @@ namespace dlib
} }
} }
// ------------------------------------------------------------------------------------
// Writes src into dest as a depth slice: for every sample i, the whole
// i-th sample of src is copied into the i-th sample of dest starting at
// element offset sample_offset.  CPU implementation of depth-wise
// tensor concatenation.
//
// requires:
//   - dest.num_samples() == src.num_samples()
//   - dest.nr() == src.nr() && dest.nc() == src.nc()
//   - sample_offset + (src sample size) <= (dest sample size)
void concat_depth(
    tensor& dest,
    size_t sample_offset,
    const tensor& src
)
{
    const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
    const size_t src_sample_size  = static_cast<size_t>(src.nc() * src.nr() * src.k());
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.nc() == src.nc() && dest.nr() == src.nr(),
                 "All sources should fit into dest tensor size");
    DLIB_CASSERT(src_sample_size + sample_offset <= dest_sample_size,
                 "Not enough space in dest tensor");

    // host_write_only() skips any device->host copy; this assumes callers
    // fully populate dest across the sequence of concat calls — TODO confirm.
    float* dest_p      = dest.host_write_only() + sample_offset;
    const float* src_p = src.host();
    // Use a signed counter to match the signed return type of num_samples().
    for (long long i = 0; i < src.num_samples(); ++i)
    {
        ::memcpy(dest_p, src_p, src_sample_size * sizeof(float));
        dest_p += dest_sample_size;
        src_p  += src_sample_size;
    }
}
// Inverse of concat_depth(): for every sample i, reads one dest-sized
// slice out of the i-th sample of src, starting at element offset
// sample_offset, and writes it to the i-th sample of dest.  CPU
// implementation of depth-wise tensor splitting.
//
// requires:
//   - dest.num_samples() == src.num_samples()
//   - dest.nr() == src.nr() && dest.nc() == src.nc()
//   - sample_offset + (dest sample size) <= (src sample size)
void split_depth(
    tensor& dest,
    size_t sample_offset,
    const tensor& src
)
{
    const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
    const size_t src_sample_size  = static_cast<size_t>(src.nc() * src.nr() * src.k());
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.nc() == src.nc() && dest.nr() == src.nr(),
                 "All sources should fit into dest tensor size");
    // NOTE: written as an addition on the left side.  The original form
    // (dest_sample_size <= src_sample_size - sample_offset) underflows the
    // unsigned subtraction when sample_offset > src_sample_size, letting an
    // out-of-range offset pass the check.
    DLIB_CASSERT(dest_sample_size + sample_offset <= src_sample_size,
                 "Not enough data in src tensor");

    float* dest_p      = dest.host_write_only();
    const float* src_p = src.host() + sample_offset;
    // Use a signed counter to match the signed return type of num_samples().
    for (long long i = 0; i < src.num_samples(); ++i)
    {
        ::memcpy(dest_p, src_p, dest_sample_size * sizeof(float));
        dest_p += dest_sample_size;
        src_p  += src_sample_size;
    }
}
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
......
...@@ -364,6 +364,19 @@ namespace dlib ...@@ -364,6 +364,19 @@ namespace dlib
long last_padding_x; long last_padding_x;
}; };
// ----------------------------------------------------------------------------------------
// Copies every sample of src into the corresponding sample of dest,
// starting at element offset sample_offset within each dest sample
// (depth-wise concatenation, CPU implementation).
void concat_depth(
tensor& dest,
size_t sample_offset,
const tensor& src
);
// Inverse of concat_depth(): extracts a dest-sized slice from each
// sample of src, starting at element offset sample_offset, into dest.
void split_depth(
tensor& dest,
size_t sample_offset,
const tensor& src
);
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
} }
......
...@@ -761,6 +761,56 @@ namespace dlib ...@@ -761,6 +761,56 @@ namespace dlib
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// Writes src into dest as a depth slice: for every sample i, the whole
// i-th sample of src is copied into the i-th sample of dest starting at
// element offset sample_offset.  CUDA implementation of depth-wise
// tensor concatenation (device-to-device copy).
//
// requires:
//   - dest.num_samples() == src.num_samples()
//   - dest.nr() == src.nr() && dest.nc() == src.nc()
//   - sample_offset + (src sample size) <= (dest sample size)
void concat_depth(
    tensor& dest,
    size_t sample_offset,
    const tensor& src
)
{
    const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
    const size_t src_sample_size  = static_cast<size_t>(src.nc() * src.nr() * src.k());
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.nc() == src.nc() && dest.nr() == src.nr(),
                 "All sources should fit into dest tensor size");
    DLIB_CASSERT(src_sample_size + sample_offset <= dest_sample_size,
                 "Not enough space in dest tensor");

    float* dest_p      = dest.device_write_only() + sample_offset;
    const float* src_p = src.device();
    // One strided 2D copy instead of num_samples() separate cudaMemcpy
    // calls: each "row" is one sample, the pitches are the per-sample
    // strides in bytes.  The assert above guarantees dpitch >= width.
    CHECK_CUDA(cudaMemcpy2D(dest_p, dest_sample_size * sizeof(float),
                            src_p,  src_sample_size  * sizeof(float),
                            src_sample_size * sizeof(float),
                            src.num_samples(),
                            cudaMemcpyDeviceToDevice));
}
// Inverse of concat_depth(): for every sample i, reads one dest-sized
// slice out of the i-th sample of src, starting at element offset
// sample_offset, and writes it to the i-th sample of dest.  CUDA
// implementation (device-to-device copy).
//
// requires:
//   - dest.num_samples() == src.num_samples()
//   - dest.nr() == src.nr() && dest.nc() == src.nc()
//   - sample_offset + (dest sample size) <= (src sample size)
void split_depth(
    tensor& dest,
    size_t sample_offset,
    const tensor& src
)
{
    const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
    const size_t src_sample_size  = static_cast<size_t>(src.nc() * src.nr() * src.k());
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.nc() == src.nc() && dest.nr() == src.nr(),
                 "All sources should fit into dest tensor size");
    // NOTE: written as an addition on the left side.  The original form
    // (dest_sample_size <= src_sample_size - sample_offset) underflows the
    // unsigned subtraction when sample_offset > src_sample_size, letting an
    // out-of-range offset pass the check.
    DLIB_CASSERT(dest_sample_size + sample_offset <= src_sample_size,
                 "Not enough data in src tensor");

    float* dest_p      = dest.device_write_only();
    const float* src_p = src.device() + sample_offset;
    // One strided 2D copy instead of num_samples() separate cudaMemcpy
    // calls: each "row" is one sample, the pitches are the per-sample
    // strides in bytes.  The assert above guarantees spitch >= width.
    CHECK_CUDA(cudaMemcpy2D(dest_p, dest_sample_size * sizeof(float),
                            src_p,  src_sample_size  * sizeof(float),
                            dest_sample_size * sizeof(float),
                            src.num_samples(),
                            cudaMemcpyDeviceToDevice));
}
// ----------------------------------------------------------------------------------------
} }
} }
...@@ -244,6 +244,17 @@ namespace dlib ...@@ -244,6 +244,17 @@ namespace dlib
tensor& params_grad tensor& params_grad
); );
// Copies every sample of src into the corresponding sample of dest,
// starting at element offset sample_offset within each dest sample
// (depth-wise concatenation, CUDA device-to-device implementation).
void concat_depth(
tensor& dest,
size_t sample_offset,
const tensor& src
);
// Inverse of concat_depth(): extracts a dest-sized slice from each
// sample of src, starting at element offset sample_offset, into dest.
void split_depth(
tensor& dest,
size_t sample_offset,
const tensor& src
);
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
......
...@@ -639,60 +639,20 @@ namespace dlib { namespace tt ...@@ -639,60 +639,20 @@ namespace dlib { namespace tt
// Depth-wise concatenation entry point: forwards to the device-specific
// implementation.  All size/shape checking lives in the cpu/cuda
// versions so it is not duplicated here.
void concat_depth(tensor& dest, size_t sample_offset, const tensor& src)
{
#ifdef DLIB_USE_CUDA
    cuda::concat_depth(dest, sample_offset, src);
#else
    cpu::concat_depth(dest, sample_offset, src);
#endif
}
// Depth-wise split entry point: forwards to the device-specific
// implementation.  All size/shape checking lives in the cpu/cuda
// versions so it is not duplicated here.
void split_depth(tensor& dest, size_t sample_offset, const tensor& src)
{
#ifdef DLIB_USE_CUDA
    cuda::split_depth(dest, sample_offset, src);
#else
    cpu::split_depth(dest, sample_offset, src);
#endif
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment