InvertedListAppend.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "InvertedListAppend.cuh"
#include "../../FaissAssert.h"
#include "../utils/Float16.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"

namespace faiss { namespace gpu {

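// Kernel: one thread per updated list. Each thread overwrites the stored
// length, encoded-data pointer, and index pointer for its inverted list.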
__global__ void
runUpdateListPointers(Tensor<int, 1, true> listIds,
                      Tensor<int, 1, true> newListLength,
                      Tensor<void*, 1, true> newCodePointers,
                      Tensor<void*, 1, true> newIndexPointers,
                      int* listLengths,
                      void** listCodes,
                      void** listIndices) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[index];
  listLengths[listId] = newListLength[index];
  listCodes[listId] = newCodePointers[index];
  listIndices[listId] = newIndexPointers[index];
}

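// Host-side wrapper: updates the list metadata arrays on the given stream,
// launching one thread per entry in listIds (capped at the device's maximum
// block size).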
void
runUpdateListPointers(Tensor<int, 1, true>& listIds,
                      Tensor<int, 1, true>& newListLength,
                      Tensor<void*, 1, true>& newCodePointers,
                      Tensor<void*, 1, true>& newIndexPointers,
                      thrust::device_vector<int>& listLengths,
                      thrust::device_vector<void*>& listCodes,
                      thrust::device_vector<void*>& listIndices,
                      cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

  runUpdateListPointers<<<grid, block, 0, stream>>>(
    listIds, newListLength, newCodePointers, newIndexPointers,
    listLengths.data().get(),
    listCodes.data().get(),
    listIndices.data().get());

  CUDA_TEST_ERROR();
}

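// Kernel: one thread per encoded vector being appended. Each thread writes
// the vector's user index (when indices are kept on the GPU) and its encoded
// bytes at the offset previously reserved in the destination inverted list.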
template <IndicesOptions Opt>
__global__ void
ivfpqInvertedListAppend(Tensor<int, 1, true> listIds,
                        Tensor<int, 1, true> listOffset,
                        Tensor<int, 2, true> encodings,
                        Tensor<long, 1, true> indices,
                        void** listCodes,
                        void** listIndices) {
  int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x;

  if (encodingToAdd >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[encodingToAdd];
  int offset = listOffset[encodingToAdd];

  // The vector being added could be invalid (e.g., it contains NaNs)
  if (listId == -1 || offset == -1) {
    return;
  }

  auto encoding = encodings[encodingToAdd];
  long index = indices[encodingToAdd];

  if (Opt == INDICES_32_BIT) {
    // FIXME: there could be overflow here, but where should we check this?
    ((int*) listIndices[listId])[offset] = (int) index;
  } else if (Opt == INDICES_64_BIT) {
    ((long*) listIndices[listId])[offset] = (long) index;
  } else {
    // INDICES_CPU or INDICES_IVF; no indices are being stored
  }

  unsigned char* codeStart =
    ((unsigned char*) listCodes[listId]) + offset * encodings.getSize(1);

  // FIXME: slow
  for (int i = 0; i < encodings.getSize(1); ++i) {
    codeStart[i] = (unsigned char) encoding[i];
  }
}

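// Host-side wrapper for the IVFPQ append kernel: launches one thread per
// encoding and dispatches on how (or whether) user indices are stored on the
// GPU.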
void
runIVFPQInvertedListAppend(Tensor<int, 1, true>& listIds,
                           Tensor<int, 1, true>& listOffset,
                           Tensor<int, 2, true>& encodings,
                           Tensor<long, 1, true>& indices,
                           thrust::device_vector<void*>& listCodes,
                           thrust::device_vector<void*>& listIndices,
                           IndicesOptions indicesOptions,
                           cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

#define RUN_APPEND(IND)                                       \
  do {                                                        \
    ivfpqInvertedListAppend<IND><<<grid, block, 0, stream>>>( \
      listIds, listOffset, encodings, indices,                \
      listCodes.data().get(),                                 \
      listIndices.data().get());                              \
  } while (0)

  if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) {
    // no need to maintain indices on the GPU
    RUN_APPEND(INDICES_IVF);
  } else if (indicesOptions == INDICES_32_BIT) {
    RUN_APPEND(INDICES_32_BIT);
  } else if (indicesOptions == INDICES_64_BIT) {
    RUN_APPEND(INDICES_64_BIT);
  } else {
    // unknown index storage type
    FAISS_ASSERT(false);
  }

  CUDA_TEST_ERROR();

#undef RUN_APPEND
}

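// Kernel: one block per vector being appended. Thread 0 writes the vector's
// user index (when indices are kept on the GPU); all threads copy the vector
// data as float32 or float16. With Exact, the block has one thread per
// dimension; otherwise each thread strides over the dimensions.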
template <IndicesOptions Opt, bool Exact, bool Float16>
__global__ void
ivfFlatInvertedListAppend(Tensor<int, 1, true> listIds,
                          Tensor<int, 1, true> listOffset,
                          Tensor<float, 2, true> vecs,
                          Tensor<long, 1, true> indices,
                          void** listData,
                          void** listIndices) {
  int vec = blockIdx.x;

  int listId = listIds[vec];
  int offset = listOffset[vec];

  // The vector being added could be invalid (e.g., it contains NaNs)
  if (listId == -1 || offset == -1) {
    return;
  }

  if (threadIdx.x == 0) {
    long index = indices[vec];

    if (Opt == INDICES_32_BIT) {
      // FIXME: there could be overflow here, but where should we check this?
      ((int*) listIndices[listId])[offset] = (int) index;
    } else if (Opt == INDICES_64_BIT) {
      ((long*) listIndices[listId])[offset] = (long) index;
    } else {
      // INDICES_CPU or INDICES_IVF; no indices are being stored
    }
  }

#ifdef FAISS_USE_FLOAT16
  // FIXME: should use half2 for better memory b/w
  if (Float16) {
    half* vecStart = ((half*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = __float2half(vecs[vec][threadIdx.x]);
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = __float2half(vecs[vec][i]);
      }
    }
  }
#else
  static_assert(!Float16, "float16 unsupported");
#endif

  if (!Float16) {
    float* vecStart = ((float*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = vecs[vec][threadIdx.x];
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = vecs[vec][i];
      }
    }
  }
}

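// Host-side wrapper for the IVFFlat append kernel: launches one block per
// vector and dispatches on index storage, float16 usage, and whether the
// vector dimension fits within a single block of threads.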
void
runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
                             Tensor<int, 1, true>& listOffset,
                             Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices,
                             bool useFloat16,
                             thrust::device_vector<void*>& listData,
                             thrust::device_vector<void*>& listIndices,
                             IndicesOptions indicesOptions,
                             cudaStream_t stream) {
  int maxThreads = getMaxThreadsCurrentDevice();
  bool exact = vecs.getSize(1) <= maxThreads;

  // Each block will handle appending a single vector
  dim3 grid(vecs.getSize(0));
  dim3 block(std::min(vecs.getSize(1), maxThreads));

#define RUN_APPEND_OPT(OPT, EXACT, FLOAT16)        \
  do {                                             \
    ivfFlatInvertedListAppend<OPT, EXACT, FLOAT16> \
      <<<grid, block, 0, stream>>>(                \
        listIds, listOffset, vecs, indices,        \
        listData.data().get(),                     \
        listIndices.data().get());                 \
  } while (0)

#define RUN_APPEND(EXACT, FLOAT16)                                             \
  do {                                                                         \
    if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) {  \
      /* no indices are maintained on the GPU */                               \
      RUN_APPEND_OPT(INDICES_IVF, EXACT, FLOAT16);                             \
    } else if (indicesOptions == INDICES_32_BIT) {                             \
      RUN_APPEND_OPT(INDICES_32_BIT, EXACT, FLOAT16);                          \
    } else if (indicesOptions == INDICES_64_BIT) {                             \
      RUN_APPEND_OPT(INDICES_64_BIT, EXACT, FLOAT16);                          \
    } else {                                                                   \
      FAISS_ASSERT(false);                                                     \
    }                                                                          \
  } while (0)

  if (useFloat16) {
#ifdef FAISS_USE_FLOAT16
    if (exact) {
      RUN_APPEND(true, true);
    } else {
      RUN_APPEND(false, true);
    }
#else
    // no float16 support
    FAISS_ASSERT(false);
#endif
  } else {
    if (exact) {
      RUN_APPEND(true, false);
    } else {
      RUN_APPEND(false, false);
    }
  }

  CUDA_TEST_ERROR();

#undef RUN_APPEND
#undef RUN_APPEND_OPT
}

} } // namespace