Commit 59892409 authored by Fm

depth layer: cuda concat/split moved to cpu/cuda files

parent 28c4a482
@@ -1741,6 +1741,57 @@ namespace dlib
            }
        }

    // ------------------------------------------------------------------------------------

        void concat_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        )
        {
            const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
            const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
            DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                dest.nc() == src.nc() && dest.nr() == src.nr(),
                "dest and src must have matching num_samples, nr, and nc");
            DLIB_CASSERT(dest_sample_size >= src_sample_size + sample_offset, "Not enough space in dest tensor");

            float* dest_p = dest.host_write_only() + sample_offset;
            const float* src_p = src.host();
            for (unsigned long i = 0; i < src.num_samples(); ++i)
            {
                ::memcpy(dest_p, src_p, src_sample_size * sizeof(float));
                dest_p += dest_sample_size;
                src_p += src_sample_size;
            }
        }

        void split_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        )
        {
            const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
            const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
            DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                dest.nc() == src.nc() && dest.nr() == src.nr(),
                "dest and src must have matching num_samples, nr, and nc");
            DLIB_CASSERT(dest_sample_size + sample_offset <= src_sample_size, "Not enough data in src tensor");

            float* dest_p = dest.host_write_only();
            const float* src_p = src.host() + sample_offset;
            for (unsigned long i = 0; i < src.num_samples(); ++i)
            {
                ::memcpy(dest_p, src_p, dest_sample_size * sizeof(float));
                dest_p += dest_sample_size;
                src_p += src_sample_size;
            }
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
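For context, a minimal usage sketch of the new CPU routines. The helper name pack_depthwise and the shapes are illustrative assumptions, not part of the commit: with a of shape (2,3,4,4) and b of shape (2,5,4,4), each dest sample holds 8*4*4 = 128 floats, a's block occupies floats [0,48) of each sample and b's occupies [48,128).

    // Hypothetical helper built on this commit's cpu::concat_depth (assumes
    // the cpu_dlib.h declarations below are visible via dlib's headers).
    #include <dlib/dnn.h>

    void pack_depthwise(dlib::resizable_tensor& out,
                        const dlib::tensor& a, const dlib::tensor& b)
    {
        // a and b must agree on num_samples, nr, and nc; out gets the summed depth.
        out.set_size(a.num_samples(), a.k() + b.k(), a.nr(), a.nc());
        dlib::cpu::concat_depth(out, 0, a);                    // fills channels [0, a.k())
        const size_t off = (size_t)a.k() * a.nr() * a.nc();    // skip a's block in each sample
        dlib::cpu::concat_depth(out, off, b);                  // fills channels [a.k(), a.k()+b.k())
    }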
@@ -364,6 +364,19 @@ namespace dlib
            long last_padding_x;
        };

    // ----------------------------------------------------------------------------------------

        void concat_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        );

        void split_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        );

    // ----------------------------------------------------------------------------------------
    }
@@ -761,6 +761,56 @@ namespace dlib
    // ----------------------------------------------------------------------------------------

        void concat_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        )
        {
            const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
            const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
            DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                dest.nc() == src.nc() && dest.nr() == src.nr(),
                "dest and src must have matching num_samples, nr, and nc");
            DLIB_CASSERT(dest_sample_size >= src_sample_size + sample_offset, "Not enough space in dest tensor");

            float* dest_p = dest.device_write_only() + sample_offset;
            const float* src_p = src.device();
            for (unsigned long i = 0; i < src.num_samples(); ++i)
            {
                CHECK_CUDA(cudaMemcpy(dest_p, src_p, src_sample_size * sizeof(float), cudaMemcpyDeviceToDevice));
                dest_p += dest_sample_size;
                src_p += src_sample_size;
            }
        }

        void split_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        )
        {
            const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
            const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
            DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                dest.nc() == src.nc() && dest.nr() == src.nr(),
                "dest and src must have matching num_samples, nr, and nc");
            DLIB_CASSERT(dest_sample_size + sample_offset <= src_sample_size, "Not enough data in src tensor");

            float* dest_p = dest.device_write_only();
            const float* src_p = src.device() + sample_offset;
            for (unsigned long i = 0; i < src.num_samples(); ++i)
            {
                CHECK_CUDA(cudaMemcpy(dest_p, src_p, dest_sample_size * sizeof(float), cudaMemcpyDeviceToDevice));
                dest_p += dest_sample_size;
                src_p += src_sample_size;
            }
        }

    // ----------------------------------------------------------------------------------------
    }
}
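A side note on the copy pattern: since both pointers advance by a fixed stride per sample, the per-sample cudaMemcpy loop above could in principle be collapsed into a single strided transfer with cudaMemcpy2D, treating each sample as one "row" whose pitch is the full sample size in bytes. A hedged sketch of that alternative (concat_depth_2d is an illustrative name; the commit itself keeps the simple loop):

    #include <cuda_runtime.h>
    #include <cstddef>

    // One strided device-to-device copy instead of num_samples separate copies.
    cudaError_t concat_depth_2d(float* dest, size_t dest_sample_size,
                                const float* src, size_t src_sample_size,
                                size_t sample_offset, size_t num_samples)
    {
        return cudaMemcpy2D(dest + sample_offset,
                            dest_sample_size * sizeof(float),  // dst pitch (bytes between samples)
                            src,
                            src_sample_size * sizeof(float),   // src pitch
                            src_sample_size * sizeof(float),   // bytes copied per sample
                            num_samples,                       // number of "rows" (samples)
                            cudaMemcpyDeviceToDevice);
    }

This is valid because the width copied never exceeds either pitch (the concat assertion guarantees dest_sample_size >= src_sample_size + sample_offset); whether it is actually faster than the loop would need measuring.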
@@ -244,6 +244,17 @@ namespace dlib
            tensor& params_grad
        );

        void concat_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        );

        void split_depth(
            tensor& dest,
            size_t sample_offset,
            const tensor& src
        );

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
@@ -639,60 +639,20 @@ namespace dlib { namespace tt
     void concat_depth(tensor& dest, size_t sample_offset, const tensor& src)
     {
-        const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
-        const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
-        DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
-            dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
-        DLIB_CASSERT(dest_sample_size >= src_sample_size + sample_offset, "Not enough space in dest tensor");
-#ifdef DLIB_USE_CUDA
-        float* dest_p = dest.device_write_only() + sample_offset;
-        const float* src_p = src.device();
-#else
-        float* dest_p = dest.host_write_only() + sample_offset;
-        const float* src_p = src.host();
-#endif
-        for (unsigned long i = 0; i < src.num_samples(); ++i)
-        {
 #ifdef DLIB_USE_CUDA
-            CHECK_CUDA(cudaMemcpy(dest_p, src_p, src_sample_size * sizeof(float), cudaMemcpyDeviceToDevice));
+        cuda::concat_depth(dest, sample_offset, src);
 #else
-            ::memcpy(dest_p, src_p, src_sample_size * sizeof(float));
+        cpu::concat_depth(dest, sample_offset, src);
 #endif
-            dest_p += dest_sample_size;
-            src_p += src_sample_size;
-        }
     }

     void split_depth(tensor& dest, size_t sample_offset, const tensor& src)
     {
-        const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
-        const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
-        DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
-            dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
-        DLIB_CASSERT(dest_sample_size <= src_sample_size - sample_offset, "Not enough space in dest tensor");
-#ifdef DLIB_USE_CUDA
-        float* dest_p = dest.device_write_only();
-        const float* src_p = src.device() + sample_offset;
-#else
-        float* dest_p = dest.host_write_only();
-        const float* src_p = src.host() + sample_offset;
-#endif
-        for (unsigned long i = 0; i < src.num_samples(); ++i)
-        {
 #ifdef DLIB_USE_CUDA
-            CHECK_CUDA(cudaMemcpy(dest_p, src_p, dest_sample_size * sizeof(float), cudaMemcpyDeviceToDevice));
+        cuda::split_depth(dest, sample_offset, src);
 #else
-            ::memcpy(dest_p, src_p, dest_sample_size * sizeof(float));
+        cpu::split_depth(dest, sample_offset, src);
 #endif
-            dest_p += dest_sample_size;
-            src_p += src_sample_size;
-        }
     }

 // ----------------------------------------------------------------------------------------
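With the dispatch in place, callers only touch the tt:: entry points and the CPU/CUDA choice happens behind DLIB_USE_CUDA. A round-trip sketch under assumed shapes (roundtrip_example and the constant fills are illustrative, not from the commit):

    #include <dlib/dnn.h>
    using namespace dlib;

    void roundtrip_example()
    {
        resizable_tensor a(2, 3, 4, 4), b(2, 5, 4, 4), whole(2, 8, 4, 4);
        a = 1; b = 2;  // fill with constants

        // Forward: pack a and b depth-wise into `whole`.
        tt::concat_depth(whole, 0, a);
        tt::concat_depth(whole, a.k() * a.nr() * a.nc(), b);

        // Backward: slice each part back out of `whole`.
        tt::split_depth(a, 0, whole);
        tt::split_depth(b, a.k() * a.nr() * a.nc(), whole);
    }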