Faiss
InvertedListAppend.cu
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
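// Device kernels and host-side wrappers for appending encoded (IVFPQ) and
// flat (IVF flat) vectors to GPU inverted lists, and for updating per-list
// metadata (lengths and code/index storage pointers).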
#include "InvertedListAppend.cuh"
#include "../../FaissAssert.h"
#include "../utils/Float16.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"

namespace faiss { namespace gpu {
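// Kernel for updating device-side inverted list metadata. Each thread
// handles one entry of listIds: it writes the list's new length and its
// new code/index storage pointers into the flat device arrays.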
__global__ void
runUpdateListPointers(Tensor<int, 1, true> listIds,
                      Tensor<int, 1, true> newListLength,
                      Tensor<void*, 1, true> newCodePointers,
                      Tensor<void*, 1, true> newIndexPointers,
                      int* listLengths,
                      void** listCodes,
                      void** listIndices) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[index];
  listLengths[listId] = newListLength[index];
  listCodes[listId] = newCodePointers[index];
  listIndices[listId] = newIndexPointers[index];
}
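// Host-side wrapper: launches one thread per updated list. The block size
// is capped at the device's maximum thread count and the grid is sized
// with utils::divUp so that every entry of listIds is covered.
//
// A minimal usage sketch (hypothetical caller state; the variable names
// below are illustrative only and are not defined in this file):
//
//   // thrust::device_vector<int> lengths;   // current length of each list
//   // thrust::device_vector<void*> codes;   // device pointer per list's codes
//   // thrust::device_vector<void*> ids;     // device pointer per list's indices
//   // runUpdateListPointers(listIds, newListLength,
//   //                       newCodePointers, newIndexPointers,
//   //                       lengths, codes, ids, stream);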
void
runUpdateListPointers(Tensor<int, 1, true>& listIds,
                      Tensor<int, 1, true>& newListLength,
                      Tensor<void*, 1, true>& newCodePointers,
                      Tensor<void*, 1, true>& newIndexPointers,
                      thrust::device_vector<int>& listLengths,
                      thrust::device_vector<void*>& listCodes,
                      thrust::device_vector<void*>& listIndices,
                      cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

  runUpdateListPointers<<<grid, block, 0, stream>>>(
    listIds, newListLength, newCodePointers, newIndexPointers,
    listLengths.data().get(),
    listCodes.data().get(),
    listIndices.data().get());

  CUDA_TEST_ERROR();
}
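// Kernel for appending PQ-encoded vectors to the inverted lists. Each
// thread appends one encoding: it copies the code bytes into the list's
// code storage and, depending on the IndicesOptions template parameter,
// writes the user-provided index as a 32- or 64-bit value. Entries whose
// listId or offset is -1 were marked invalid upstream and are skipped.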
template <IndicesOptions Opt>
__global__ void
ivfpqInvertedListAppend(Tensor<int, 1, true> listIds,
                        Tensor<int, 1, true> listOffset,
                        Tensor<int, 2, true> encodings,
                        Tensor<long, 1, true> indices,
                        void** listCodes,
                        void** listIndices) {
  int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x;

  if (encodingToAdd >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[encodingToAdd];
  int offset = listOffset[encodingToAdd];

  // The vector being added could have been flagged as invalid
  // (e.g., it contained NaNs), in which case it is skipped
  if (listId == -1 || offset == -1) {
    return;
  }

  auto encoding = encodings[encodingToAdd];
  long index = indices[encodingToAdd];

  if (Opt == INDICES_32_BIT) {
    // FIXME: there could be overflow here, but where should we check this?
    ((int*) listIndices[listId])[offset] = (int) index;
  } else if (Opt == INDICES_64_BIT) {
    ((long*) listIndices[listId])[offset] = (long) index;
  } else {
    // INDICES_CPU or INDICES_IVF; no indices are being stored
  }

  // Destination of this vector's code bytes within the list's code storage
  unsigned char* codeStart =
    ((unsigned char*) listCodes[listId]) + offset * encodings.getSize(1);

  // FIXME: slow; copies one byte per iteration
  for (int i = 0; i < encodings.getSize(1); ++i) {
    codeStart[i] = (unsigned char) encoding[i];
  }
}
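// Host-side wrapper for the IVFPQ append kernel. Launches one thread per
// encoding and uses the RUN_APPEND macro to instantiate the kernel for
// the requested index storage type (none, 32-bit or 64-bit).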
void
runIVFPQInvertedListAppend(Tensor<int, 1, true>& listIds,
                           Tensor<int, 1, true>& listOffset,
                           Tensor<int, 2, true>& encodings,
                           Tensor<long, 1, true>& indices,
                           thrust::device_vector<void*>& listCodes,
                           thrust::device_vector<void*>& listIndices,
                           IndicesOptions indicesOptions,
                           cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

#define RUN_APPEND(IND)                                       \
  do {                                                        \
    ivfpqInvertedListAppend<IND><<<grid, block, 0, stream>>>( \
      listIds, listOffset, encodings, indices,                \
      listCodes.data().get(),                                 \
      listIndices.data().get());                              \
  } while (0)

  if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) {
    // no need to maintain indices on the GPU
    RUN_APPEND(INDICES_IVF);
  } else if (indicesOptions == INDICES_32_BIT) {
    RUN_APPEND(INDICES_32_BIT);
  } else if (indicesOptions == INDICES_64_BIT) {
    RUN_APPEND(INDICES_64_BIT);
  } else {
    // unknown index storage type
    FAISS_ASSERT(false);
  }

  CUDA_TEST_ERROR();

#undef RUN_APPEND
}
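// Kernel for appending full-precision (or float16) vectors to the inverted
// lists. Each block handles a single vector: thread 0 writes the user index
// (if indices are stored on the GPU), and all threads cooperate to copy the
// vector's dimensions. When Exact is true the dimension fits within one
// block, so each thread copies exactly one element; otherwise the threads
// loop over the dimension with a blockDim.x stride. The Float16 path is
// only compiled when FAISS_USE_FLOAT16 is defined.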
template <IndicesOptions Opt, bool Exact, bool Float16>
__global__ void
ivfFlatInvertedListAppend(Tensor<int, 1, true> listIds,
                          Tensor<int, 1, true> listOffset,
                          Tensor<float, 2, true> vecs,
                          Tensor<long, 1, true> indices,
                          void** listData,
                          void** listIndices) {
  // Each block handles a single vector
  int vec = blockIdx.x;

  int listId = listIds[vec];
  int offset = listOffset[vec];

  // The vector being added could have been flagged as invalid
  // (e.g., it contained NaNs), in which case it is skipped
  if (listId == -1 || offset == -1) {
    return;
  }

  // Only one thread per block needs to write the user index
  if (threadIdx.x == 0) {
    long index = indices[vec];

    if (Opt == INDICES_32_BIT) {
      // FIXME: there could be overflow here, but where should we check this?
      ((int*) listIndices[listId])[offset] = (int) index;
    } else if (Opt == INDICES_64_BIT) {
      ((long*) listIndices[listId])[offset] = (long) index;
    } else {
      // INDICES_CPU or INDICES_IVF; no indices are being stored
    }
  }

#ifdef FAISS_USE_FLOAT16
  // FIXME: should use half2 for better memory b/w
  if (Float16) {
    half* vecStart = ((half*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      // The dimension fits within a single block; one element per thread
      vecStart[threadIdx.x] = __float2half(vecs[vec][threadIdx.x]);
    } else {
      // Strided loop over the dimension
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = __float2half(vecs[vec][i]);
      }
    }
  }
#else
  static_assert(!Float16, "float16 unsupported");
#endif

  if (!Float16) {
    float* vecStart = ((float*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = vecs[vec][threadIdx.x];
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = vecs[vec][i];
      }
    }
  }
}
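// Host-side wrapper for the IVF flat append kernel. listIds, listOffset and
// indices each contain one entry per vector in vecs (which is numVecs x dim).
// One block is launched per vector; the block size is min(dim, maxThreads),
// and the kernel is instantiated for the exact/strided copy path, the
// float16/float32 storage format, and the requested index storage type.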
void
runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
                             Tensor<int, 1, true>& listOffset,
                             Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices,
                             bool useFloat16,
                             thrust::device_vector<void*>& listData,
                             thrust::device_vector<void*>& listIndices,
                             IndicesOptions indicesOptions,
                             cudaStream_t stream) {
  int maxThreads = getMaxThreadsCurrentDevice();

  // If the vector dimension fits within a single block, each thread can
  // copy exactly one element
  bool exact = vecs.getSize(1) <= maxThreads;

  // Each block will handle appending a single vector
  dim3 grid(vecs.getSize(0));
  dim3 block(std::min(vecs.getSize(1), maxThreads));

#define RUN_APPEND_OPT(OPT, EXACT, FLOAT16)          \
  do {                                               \
    ivfFlatInvertedListAppend<OPT, EXACT, FLOAT16>   \
      <<<grid, block, 0, stream>>>(                  \
        listIds, listOffset, vecs, indices,          \
        listData.data().get(),                       \
        listIndices.data().get());                   \
  } while (0)

#define RUN_APPEND(EXACT, FLOAT16)                                            \
  do {                                                                        \
    if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) { \
      /* no indices are maintained on the GPU */                              \
      RUN_APPEND_OPT(INDICES_IVF, EXACT, FLOAT16);                            \
    } else if (indicesOptions == INDICES_32_BIT) {                            \
      RUN_APPEND_OPT(INDICES_32_BIT, EXACT, FLOAT16);                         \
    } else if (indicesOptions == INDICES_64_BIT) {                            \
      RUN_APPEND_OPT(INDICES_64_BIT, EXACT, FLOAT16);                         \
    } else {                                                                  \
      FAISS_ASSERT(false);                                                    \
    }                                                                         \
  } while (0)

  if (useFloat16) {
#ifdef FAISS_USE_FLOAT16
    if (exact) {
      RUN_APPEND(true, true);
    } else {
      RUN_APPEND(false, true);
    }
#else
    // no float16 support
    FAISS_ASSERT(false);
#endif
  } else {
    if (exact) {
      RUN_APPEND(true, false);
    } else {
      RUN_APPEND(false, false);
    }
  }

  CUDA_TEST_ERROR();

#undef RUN_APPEND
#undef RUN_APPEND_OPT
}

} } // namespace