/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "../../FaissAssert.h"
#include "DeviceUtils.h"
#include "MemorySpace.h"
#include "StaticUtils.h"
#include <algorithm>
#include <cuda.h>
#include <vector>
namespace faiss { namespace gpu {
/// A simple version of thrust::device_vector<T>, but has more control
/// over whether resize() initializes new space with T() (which we
/// don't want), and control on how much the reserved space grows by
/// upon resize/reserve. It is also meant for POD types only.
template <typename T>
class DeviceVector {
 public:
  /// Constructs an empty vector whose allocations are made in `space`
  DeviceVector(MemorySpace space = MemorySpace::Device)
      : data_(nullptr),
        num_(0),
        capacity_(0),
        space_(space) {
  }

  /// We own a raw GPU allocation; an implicitly-generated copy would
  /// alias `data_` and double-free it in the destructor, so copying is
  /// explicitly forbidden (rule of three)
  DeviceVector(const DeviceVector&) = delete;
  DeviceVector& operator=(const DeviceVector&) = delete;

  ~DeviceVector() {
    clear();
  }

  // Clear all allocated memory; reset to zero size
  void clear() {
    // cudaFree is a no-op on nullptr, so this is safe on an
    // already-empty vector
    CUDA_VERIFY(cudaFree(data_));
    data_ = nullptr;
    num_ = 0;
    capacity_ = 0;
  }

  size_t size() const { return num_; }
  size_t capacity() const { return capacity_; }
  T* data() { return data_; }
  const T* data() const { return data_; }

  /// Copies our contents back to the host, reinterpreted as a vector
  /// of OutT; `size() * sizeof(T)` must be a multiple of sizeof(OutT).
  /// The copy is asynchronous: the caller must synchronize `stream`
  /// before reading the returned vector.
  template <typename OutT>
  std::vector<OutT> copyToHost(cudaStream_t stream) const {
    FAISS_ASSERT(num_ * sizeof(T) % sizeof(OutT) == 0);

    std::vector<OutT> out((num_ * sizeof(T)) / sizeof(OutT));
    CUDA_VERIFY(cudaMemcpyAsync(out.data(), data_, num_ * sizeof(T),
                                cudaMemcpyDeviceToHost, stream));

    return out;
  }

  // Returns true if we actually reallocated memory
  // If `reserveExact` is true, then we reserve only the memory that
  // we need for what we're appending
  bool append(const T* d,
              size_t n,
              cudaStream_t stream,
              bool reserveExact = false) {
    bool mem = false;

    if (n > 0) {
      size_t reserveSize = num_ + n;
      if (!reserveExact) {
        // Grow geometrically so repeated appends amortize the copies
        reserveSize = getNewCapacity_(reserveSize);
      }

      mem = reserve(reserveSize, stream);

      // `d` may live on the host (getDeviceForAddress returns -1) or
      // on some device; pick the copy direction accordingly
      int dev = getDeviceForAddress(d);
      if (dev == -1) {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyHostToDevice, stream));
      } else {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyDeviceToDevice, stream));
      }
      num_ += n;
    }

    return mem;
  }

  // Returns true if we actually reallocated memory
  bool resize(size_t newSize, cudaStream_t stream) {
    bool mem = false;

    if (num_ < newSize) {
      mem = reserve(getNewCapacity_(newSize), stream);
    }

    // Don't bother zero initializing the newly accessible memory
    // (unlike thrust::device_vector)
    num_ = newSize;

    return mem;
  }

  // Clean up after oversized allocations, while leaving some space to
  // remain for subsequent allocations (if `exact` false) or to
  // exactly the space we need (if `exact` true); returns space
  // reclaimed in bytes
  size_t reclaim(bool exact, cudaStream_t stream) {
    size_t free = capacity_ - num_;

    if (exact) {
      realloc_(num_, stream);
      return free * sizeof(T);
    }

    // If more than 1/4th of the space is free, then we want to
    // truncate to only having 1/8th of the space free; this still
    // preserves some space for new elements, but won't force us to
    // double our size right away
    if (free > (capacity_ / 4)) {
      size_t newFree = capacity_ / 8;
      size_t newCapacity = num_ + newFree;

      size_t oldCapacity = capacity_;
      FAISS_ASSERT(newCapacity < oldCapacity);

      realloc_(newCapacity, stream);

      return (oldCapacity - newCapacity) * sizeof(T);
    }

    return 0;
  }

  // Returns true if we actually reallocated memory
  bool reserve(size_t newCapacity, cudaStream_t stream) {
    if (newCapacity <= capacity_) {
      return false;
    }

    // Otherwise, we need new space.
    realloc_(newCapacity, stream);
    return true;
  }

 private:
  /// (Re-)allocates storage for exactly `newCapacity` elements in
  /// `space_` and copies the current contents over on `stream`
  void realloc_(size_t newCapacity, cudaStream_t stream) {
    FAISS_ASSERT(num_ <= newCapacity);

    if (newCapacity == capacity_) {
      // Nothing to do; avoids a pointless alloc/copy/free cycle when
      // e.g. reclaim(true, ...) is called on an exactly-sized vector
      return;
    }

    T* newData = nullptr;
    allocMemorySpace(space_, (void**) &newData, newCapacity * sizeof(T));
    CUDA_VERIFY(cudaMemcpyAsync(newData, data_, num_ * sizeof(T),
                                cudaMemcpyDeviceToDevice, stream));

    // FIXME: keep on reclamation queue to avoid hammering cudaFree?
    CUDA_VERIFY(cudaFree(data_));

    data_ = newData;
    capacity_ = newCapacity;
  }

  /// Growth policy: round the requested size up to the next highest
  /// power of 2
  size_t getNewCapacity_(size_t preferredSize) {
    return utils::nextHighestPowerOf2(preferredSize);
  }

  T* data_;            // owned allocation in `space_`; nullptr when empty
  size_t num_;         // number of valid elements
  size_t capacity_;    // number of elements allocated
  MemorySpace space_;  // memory space for allocations
};
} } // namespace