Commit 3ba21975 authored by Davis King

Moved gpu_data into its own file, fixed a few bugs, and cleaned up a few things.
parent 0508fe2b
dlib/CMakeLists.txt

@@ -441,6 +441,7 @@ if (NOT TARGET dlib)
               dnn/cuda_dlib.cu
               dnn/cudnn_dlibapi.cpp
               dnn/cublas_dlibapi.cpp
+              dnn/gpu_data.cpp
               )
            set(dlib_needed_libraries ${dlib_needed_libraries} ${CUDA_CUBLAS_LIBRARIES} ${cudnn})
            include_directories(${cudnn_include})
......
dlib/dnn/cublas_dlibapi.h

@@ -29,8 +29,19 @@ namespace dlib
         cublas_context(const cublas_context&) = delete;
         cublas_context& operator=(const cublas_context&) = delete;
         // but is movable
-        cublas_context(cublas_context&&) = default;
-        cublas_context& operator=(cublas_context&&) = default;
+        cublas_context(cublas_context&& item)
+        {
+            handle = item.handle;
+            item.handle = nullptr;
+        }
+        cublas_context& operator=(cublas_context&& item)
+        {
+            if (this == &item)
+                return *this;
+            handle = item.handle;
+            item.handle = nullptr;
+            return *this;
+        }

         cublas_context();
         ~cublas_context();
......
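The hunk above replaces the defaulted move operations with explicit ones that null out the handle in the moved-from object, so only one cublas_context ever tries to destroy it. Below is a minimal, self-contained sketch of the same move-only ownership pattern; handle_owner and its int* "handle" are made-up stand-ins for illustration, not dlib code. The same reasoning applies to the cudnn_context and tensor_descriptor changes later in this commit.

#include <utility>

// A minimal move-only wrapper around a raw resource.  Here the "handle" is just
// heap memory so the example is self-contained; in the dlib code above it is a
// cuBLAS/cuDNN handle.
class handle_owner
{
public:
    handle_owner() : handle(new int(42)) {}
    ~handle_owner() { delete handle; }

    // not copyable
    handle_owner(const handle_owner&) = delete;
    handle_owner& operator=(const handle_owner&) = delete;

    // but movable: the moved-from object must give up its pointer, otherwise two
    // destructors would free the same resource.  A defaulted move would just copy
    // the pointer and leave it dangling in the source object.
    handle_owner(handle_owner&& item) : handle(item.handle) { item.handle = nullptr; }
    handle_owner& operator=(handle_owner&& item)
    {
        if (this != &item)
        {
            delete handle;
            handle = item.handle;
            item.handle = nullptr;
        }
        return *this;
    }

private:
    int* handle;
};

int main()
{
    handle_owner a;
    handle_owner b(std::move(a));   // b now owns the resource, a holds nullptr
    a = std::move(b);               // ownership moves back
    return 0;
}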
dlib/dnn/cuda_errors.h

@@ -3,14 +3,11 @@
 #ifndef DLIB_CUDA_ERRORs_H_
 #define DLIB_CUDA_ERRORs_H_

-#ifdef DLIB_USE_CUDA

 #include "../error.h"

 namespace dlib
 {
-    namespace cuda
-    {
     struct cuda_error : public error
     {
         cuda_error(const std::string& message): error(message) {}
@@ -20,10 +17,8 @@ namespace dlib
     {
         cudnn_error(const std::string& message): cuda_error(message) {}
     };
-    }
 }

-#endif // DLIB_USE_CUDA

 #endif // DLIB_CUDA_ERRORs_H_
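With the DLIB_USE_CUDA guard and the nested cuda namespace gone, the error types are always declared as dlib::cuda_error and dlib::cudnn_error. A small usage sketch of the resulting exception hierarchy (the include path is an assumption; adjust to your setup):

#include <iostream>
#include "dlib/dnn/cuda_errors.h"   // path assumed; after this change the header
                                    // no longer requires DLIB_USE_CUDA

int main()
{
    try
    {
        throw dlib::cudnn_error("example failure");
    }
    catch (const dlib::cuda_error& e)   // cudnn_error derives from cuda_error
    {
        std::cerr << "GPU-related error: " << e.what() << std::endl;
    }
    catch (const dlib::error& e)        // which in turn derives from dlib::error
    {
        std::cerr << "other dlib error: " << e.what() << std::endl;
    }
    return 0;
}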
dlib/dnn/cuda_utils.h

@@ -3,10 +3,13 @@
 #ifndef DLIB_CUDA_UtILS_H_
 #define DLIB_CUDA_UtILS_H_

+#ifndef DLIB_USE_CUDA
+#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
+#endif

 #include "cuda_errors.h"

-#include <cuda.h>
+#include <cuda_runtime.h>
 #include <sstream>

@@ -19,7 +22,7 @@
            std::ostringstream sout; \
            sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
            sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\
-           throw dlib::cuda::cuda_error(sout.str()); \
+           throw dlib::cuda_error(sout.str()); \
        } \
    }

@@ -27,15 +30,19 @@
 #ifdef __CUDACC__

-class grid_stride_range
-{
+namespace dlib
+{
+    namespace cuda
+    {
+        class grid_stride_range
+        {
        /*!
            WHAT THIS OBJECT REPRESENTS
-               This is a tool for making a for loop that loops over an entire block of memory
-               inside a kernel, but doing so in a way that parallelizes appropriately across
-               all the threads in a kernel launch. For example, the following kernel would
-               add the vector a to the vector b and store the output in out (assuming all
-               vectors are of dimension n):
+               This is a tool for making a for loop that loops over an entire block of
+               memory inside a kernel, but doing so in a way that parallelizes
+               appropriately across all the threads in a kernel launch. For example,
+               the following kernel would add the vector a to the vector b and store
+               the output in out (assuming all vectors are of dimension n):
                __global__ void add_arrays(
                    const float* a,
                    const float* b,
@@ -50,7 +57,7 @@ class grid_stride_range
            }
        !*/
    public:
        __device__ grid_stride_range(
            size_t ibegin_,
            size_t iend_
@@ -91,11 +98,14 @@ public:
        {
            return iterator(iend);
        }
    private:
        size_t ibegin;
        size_t iend;
    };
+    }
+}

 #endif // __CUDACC__
......
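For reference, this is the grid-stride idiom that grid_stride_range wraps, written with plain CUDA; it is a generic, self-contained sketch rather than dlib code, and the launch configuration is an arbitrary example:

#include <cuda_runtime.h>
#include <cstdio>

// Each thread starts at its global index and strides by the total number of
// launched threads, so any grid size covers all n elements.
__global__ void add_arrays(const float* a, const float* b, float* out, size_t n)
{
    for (size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += (size_t)gridDim.x*blockDim.x)
        out[i] = a[i] + b[i];
}

int main()
{
    const size_t n = 1000;
    float *a, *b, *out;
    cudaMallocManaged((void**)&a, n*sizeof(float));
    cudaMallocManaged((void**)&b, n*sizeof(float));
    cudaMallocManaged((void**)&out, n*sizeof(float));
    for (size_t i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; }

    add_arrays<<<32, 128>>>(a, b, out, n);   // any block/grid size works
    cudaDeviceSynchronize();
    std::printf("out[0] = %f\n", out[0]);    // prints 3.0

    cudaFree(a); cudaFree(b); cudaFree(out);
    return 0;
}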
dlib/dnn/cudnn_dlibapi.cpp
(The gpu_data member functions in the removed block below were moved to the new dnn/gpu_data.cpp, shown later in this commit.)

@@ -15,133 +15,6 @@
 namespace dlib
 {
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// gpu_data member functions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// TODO, add error handling
void gpu_data::
wait_for_transfer_to_finish() const
{
if (have_active_transfer)
{
std::cout << "wait for cudaStreamSynchronize()" << std::endl;
CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
have_active_transfer = false;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_device() const
{
wait_for_transfer_to_finish();
if (!device_current)
{
std::cout << "cudaMemcpy to device" << std::endl;
CHECK_CUDA(cudaMemcpy(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice));
device_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_host() const
{
wait_for_transfer_to_finish();
if (!host_current)
{
std::cout << "cudaMemcpy to host" << std::endl;
CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
host_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
async_copy_to_device()
{
if (!device_current)
{
std::cout << "cudaMemcpyAsync to device" << std::endl;
CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
have_active_transfer = true;
device_current = true;
}
}
void gpu_data::
set_size(
size_t new_size
)
{
wait_for_transfer_to_finish();
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
try
{
void* data;
CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
// Note that we don't throw exceptions since the free calls are invariably
// called in destructors. They also shouldn't fail anyway unless someone
// is resetting the GPU card in the middle of their program.
data_host.reset((float*)data, [](float* ptr){
auto err = cudaFreeHost(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
data_device.reset((float*)data, [](float* ptr){
auto err = cudaFree(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
if (!cuda_stream)
{
cudaStream_t cstream;
CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
cuda_stream.reset(cstream, [](void* ptr){
auto err = cudaStreamDestroy((cudaStream_t)ptr);
if(err!=cudaSuccess)
std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
}
}
catch(...)
{
set_size(0);
throw;
}
}
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
     namespace cuda
     {
@@ -155,6 +28,8 @@ namespace dlib
                     throw cudnn_error("CUDA Runtime API initialization failed.");
                 case CUDNN_STATUS_ALLOC_FAILED:
                     throw cudnn_error("CUDA Resources could not be allocated.");
+                case CUDNN_STATUS_BAD_PARAM:
+                    throw cudnn_error("CUDNN_STATUS_BAD_PARAM");
                 default:
                     throw cudnn_error("A call to cuDNN failed.");
                 }
@@ -180,20 +55,16 @@ namespace dlib
     // ------------------------------------------------------------------------------------

-        tensor_descriptor::tensor_descriptor() : handle(nullptr)
-        {
-            cudnnTensorDescriptor_t h;
-            check(cudnnCreateTensorDescriptor(&h));
-            handle = h;
-        }
+        tensor_descriptor::
+        tensor_descriptor(
+        ) : handle(nullptr)
+        {
+        }

-        tensor_descriptor::~tensor_descriptor()
-        {
-            if (handle)
-            {
-                cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
-                handle = nullptr;
-            }
-        }
+        tensor_descriptor::
+        ~tensor_descriptor()
+        {
+            set_size(0,0,0,0);
+        }

         void tensor_descriptor::
@@ -204,6 +75,20 @@
             int k
         )
         {
+            if (n == 0 || nr == 0 || nc == 0 || k == 0)
+            {
+                if (handle)
+                {
+                    cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
+                    handle = nullptr;
+                }
+            }
+            else
+            {
+                cudnnTensorDescriptor_t h;
+                check(cudnnCreateTensorDescriptor(&h));
+                handle = h;
                 check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
                         CUDNN_TENSOR_NHWC,
                         CUDNN_DATA_FLOAT,
@@ -212,6 +97,7 @@
                         nr,
                         nc));
             }
+        }

         void tensor_descriptor::
         get_size (
@@ -220,6 +106,8 @@
             int& nc,
             int& k
         ) const
         {
+            if (handle)
             {
                 int nStride, cStride, hStride, wStride;
                 cudnnDataType_t datatype;
@@ -234,6 +122,14 @@
                     &hStride,
                     &wStride));
             }
+            else
+            {
+                n = 0;
+                nr = 0;
+                nc = 0;
+                k = 0;
+            }
+        }

     // ------------------------------------------------------------------------------------
......
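In the new version the cuDNN descriptor handle is created lazily inside set_size() and destroyed by passing zeros, with the destructor simply calling set_size(0,0,0,0). For reference, a standalone sketch of the raw cuDNN descriptor calls used above (error checking omitted; the sizes are arbitrary example values, not dlib code):

#include <cudnn.h>

int main()
{
    // Create, configure, and destroy a 4D tensor descriptor directly; this is
    // what tensor_descriptor::set_size() above does lazily.
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc,
                               CUDNN_TENSOR_NHWC,   // same memory layout as the dlib code
                               CUDNN_DATA_FLOAT,
                               10,    // n:  number of samples
                               3,     // k:  channels
                               28,    // nr: rows
                               28);   // nc: columns
    cudnnDestroyTensorDescriptor(desc);
    return 0;
}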
dlib/dnn/cudnn_dlibapi.h

@@ -25,8 +25,19 @@ namespace dlib
         cudnn_context(const cudnn_context&) = delete;
         cudnn_context& operator=(const cudnn_context&) = delete;
         // but is movable
-        cudnn_context(cudnn_context&&) = default;
-        cudnn_context& operator=(cudnn_context&&) = default;
+        cudnn_context(cudnn_context&& item)
+        {
+            handle = item.handle;
+            item.handle = nullptr;
+        }
+        cudnn_context& operator=(cudnn_context&& item)
+        {
+            if (this == &item)
+                return *this;
+            handle = item.handle;
+            item.handle = nullptr;
+            return *this;
+        }

         cudnn_context();
         ~cudnn_context();
@@ -53,8 +64,19 @@ namespace dlib
         tensor_descriptor(const tensor_descriptor&) = delete;
         tensor_descriptor& operator=(const tensor_descriptor&) = delete;
         // but is movable
-        tensor_descriptor(tensor_descriptor&&) = default;
-        tensor_descriptor& operator=(tensor_descriptor&&) = default;
+        tensor_descriptor(tensor_descriptor&& item)
+        {
+            handle = item.handle;
+            item.handle = nullptr;
+        }
+        tensor_descriptor& operator=(tensor_descriptor&& item)
+        {
+            if (this == &item)
+                return *this;
+            handle = item.handle;
+            item.handle = nullptr;
+            return *this;
+        }

         tensor_descriptor();
         ~tensor_descriptor();
......
dlib/dnn/gpu_data.cpp (new file)

// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_CPP_
#define DLIB_GPU_DaTA_CPP_
// Only things that require CUDA are declared in this cpp file. Everything else is in the
// gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
#ifdef DLIB_USE_CUDA
#include "gpu_data.h"
#include <iostream>
#include "cuda_utils.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
void gpu_data::
wait_for_transfer_to_finish() const
{
if (have_active_transfer)
{
std::cout << "wait for cudaStreamSynchronize()" << std::endl;
CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
have_active_transfer = false;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_device() const
{
wait_for_transfer_to_finish();
if (!device_current)
{
std::cout << "cudaMemcpy to device" << std::endl;
CHECK_CUDA(cudaMemcpy(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice));
device_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_host() const
{
wait_for_transfer_to_finish();
if (!host_current)
{
std::cout << "cudaMemcpy to host" << std::endl;
CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
host_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
async_copy_to_device()
{
if (!device_current)
{
std::cout << "cudaMemcpyAsync to device" << std::endl;
CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
have_active_transfer = true;
device_current = true;
}
}
void gpu_data::
set_size(
size_t new_size
)
{
wait_for_transfer_to_finish();
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
try
{
void* data;
CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
// Note that we don't throw exceptions since the free calls are invariably
// called in destructors. They also shouldn't fail anyway unless someone
// is resetting the GPU card in the middle of their program.
data_host.reset((float*)data, [](float* ptr){
auto err = cudaFreeHost(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
data_device.reset((float*)data, [](float* ptr){
auto err = cudaFree(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
if (!cuda_stream)
{
cudaStream_t cstream;
CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
cuda_stream.reset(cstream, [](void* ptr){
auto err = cudaStreamDestroy((cudaStream_t)ptr);
if(err!=cudaSuccess)
std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
}
}
catch(...)
{
set_size(0);
throw;
}
}
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_USE_CUDA
#endif // DLIB_GPU_DaTA_CPP_
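gpu_data keeps its host buffer in pinned (page-locked) memory allocated with cudaMallocHost so that cudaMemcpyAsync on the non-blocking stream can actually overlap with CPU work, and wait_for_transfer_to_finish() blocks on that stream before the data is read. A stripped-down, self-contained sketch of that pattern using plain CUDA runtime calls (error handling and the gpu_data bookkeeping omitted; not dlib code):

#include <cuda_runtime.h>

int main()
{
    const size_t n = 1024;
    float* host = nullptr;
    float* device = nullptr;
    cudaStream_t stream;

    cudaMallocHost((void**)&host, n*sizeof(float));            // pinned host buffer
    cudaMalloc((void**)&device, n*sizeof(float));
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    for (size_t i = 0; i < n; ++i)
        host[i] = (float)i;

    // Start the transfer without blocking the CPU ...
    cudaMemcpyAsync(device, host, n*sizeof(float), cudaMemcpyHostToDevice, stream);
    // ... other CPU work could run here ...
    cudaStreamSynchronize(stream);                             // wait before using `device`

    cudaStreamDestroy(stream);
    cudaFree(device);
    cudaFreeHost(host);
    return 0;
}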
dlib/dnn/gpu_data.h (new file)

// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_H_
#define DLIB_GPU_DaTA_H_
#include <memory>
#include "cuda_errors.h"
#include "../serialize.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class gpu_data
{
/*!
CONVENTION
- if (size() != 0) then
- data_host == a pointer to size() floats in CPU memory.
- if (data_device) then
- data_device == a pointer to size() floats in device memory.
- if (there might be an active transfer between host and device) then
- have_active_transfer == true
- We use the host_current and device_current bools to keep track of which
copy of the data (or both) is most current. e.g. if the CPU has
modified the tensor and it hasn't been copied to the device yet then
host_current==true and device_current == false.
THREAD SAFETY
This object is not thread-safe. Don't touch it from multiple threads at the
same time.
!*/
public:
gpu_data(
) : data_size(0), host_current(true), device_current(true),have_active_transfer(false)
{
}
// Not copyable
gpu_data(const gpu_data&) = delete;
gpu_data& operator=(const gpu_data&) = delete;
// but is movable
gpu_data(gpu_data&&) = default;
gpu_data& operator=(gpu_data&&) = default;
#ifdef DLIB_USE_CUDA
void async_copy_to_device();
void set_size(size_t new_size);
#else
// Note that calls to host() or device() will block until any async transfers are complete.
void async_copy_to_device(){}
void set_size(size_t new_size)
{
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
data_host.reset(new float[new_size], std::default_delete<float[]>());
data_device.reset();
}
}
#endif
const float* host() const
{
copy_to_host();
return data_host.get();
}
float* host()
{
copy_to_host();
device_current = false;
return data_host.get();
}
const float* device() const
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
return data_device.get();
}
float* device()
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
host_current = false;
return data_device.get();
}
size_t size() const { return data_size; }
private:
#ifdef DLIB_USE_CUDA
void copy_to_device() const;
void copy_to_host() const;
void wait_for_transfer_to_finish() const;
#else
void copy_to_device() const{}
void copy_to_host() const{}
void wait_for_transfer_to_finish() const{}
#endif
size_t data_size;
mutable bool host_current;
mutable bool device_current;
mutable bool have_active_transfer;
std::shared_ptr<float> data_host;
std::shared_ptr<float> data_device;
std::shared_ptr<void> cuda_stream;
};
inline void serialize(const gpu_data& item, std::ostream& out)
{
int version = 1;
serialize(version, out);
serialize(item.size(), out);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
serialize(data[i], out);
}
inline void deserialize(gpu_data& item, std::istream& in)
{
int version;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
size_t s;
deserialize(s, in);
item.set_size(s);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
deserialize(data[i], in);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_GPU_DaTA_H_
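For reference, a small usage sketch of the gpu_data interface declared above. It only exercises the host-side path, so it behaves the same with or without DLIB_USE_CUDA; the include path is an assumption about how the header is laid out in the source tree:

#include <sstream>
#include "dlib/dnn/gpu_data.h"   // path assumed; adjust to your include setup

int main()
{
    dlib::gpu_data buf;
    buf.set_size(4);

    float* p = buf.host();               // non-const host() marks the device copy stale
    for (size_t i = 0; i < buf.size(); ++i)
        p[i] = i*1.0f;

    // Round-trip through the serialize()/deserialize() helpers above.
    std::ostringstream sout;
    serialize(buf, sout);
    std::istringstream sin(sout.str());
    dlib::gpu_data buf2;
    deserialize(buf2, sin);
    return 0;
}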
dlib/dnn/tensor.h
(The gpu_data class and its serialize()/deserialize() helpers in the removed block below now live in the new gpu_data.h; tensor.h simply includes it instead.)

@@ -3,165 +3,13 @@
 #ifndef DLIB_DNn_TENSOR_H_
 #define DLIB_DNn_TENSOR_H_

-#include <memory>
 #include <cstring>
 #include "../matrix.h"
 #include "cudnn_dlibapi.h"
+#include "gpu_data.h"

 namespace dlib
 {
// ----------------------------------------------------------------------------------------
class gpu_data
{
/*!
CONVENTION
- if (size() != 0) then
- data_host == a pointer to size() floats in CPU memory.
- if (data_device) then
- data_device == a pointer to size() floats in device memory.
- if (there might be an active transfer between host and device) then
- have_active_transfer == true
- We use the host_current and device_current bools to keep track of which
copy of the data (or both) are most current. e.g. if the CPU has
modified the tensor and it hasn't been copied to the device yet then
host_current==true and device_current == false.
THREAD SAFETY
This object is not thread-safe. Don't touch it from multiple threads as the
same time.
!*/
public:
gpu_data(
) : data_size(0), host_current(true), device_current(true),have_active_transfer(false)
{
}
// Not copyable
gpu_data(const gpu_data&) = delete;
gpu_data& operator=(const gpu_data&) = delete;
// but is movable
gpu_data(gpu_data&&) = default;
gpu_data& operator=(gpu_data&&) = default;
#ifdef DLIB_USE_CUDA
void async_copy_to_device();
void set_size(size_t new_size);
#else
// Note that calls to host() or device() will block until any async transfers are complete.
void async_copy_to_device(){}
void set_size(size_t new_size)
{
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
data_host.reset(new float[new_size], std::default_delete<float[]>());
data_device.reset();
}
}
#endif
const float* host() const
{
copy_to_host();
return data_host.get();
}
float* host()
{
copy_to_host();
device_current = false;
return data_host.get();
}
const float* device() const
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
return data_device.get();
}
float* device()
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
host_current = false;
return data_device.get();
}
size_t size() const { return data_size; }
private:
#ifdef DLIB_USE_CUDA
void copy_to_device() const;
void copy_to_host() const;
void wait_for_transfer_to_finish() const;
#else
void copy_to_device() const{}
void copy_to_host() const{}
void wait_for_transfer_to_finish() const{}
#endif
size_t data_size;
mutable bool host_current;
mutable bool device_current;
mutable bool have_active_transfer;
std::shared_ptr<float> data_host;
std::shared_ptr<float> data_device;
std::shared_ptr<void> cuda_stream;
};
inline void serialize(const gpu_data& item, std::ostream& out)
{
int version = 1;
serialize(version, out);
serialize(item.size(), out);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
serialize(data[i], out);
}
inline void deserialize(gpu_data& item, std::istream& in)
{
int version;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
size_t s;
deserialize(s, in);
item.set_size(s);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
deserialize(data[i], in);
}
     // ----------------------------------------------------------------------------------------

     class tensor
......