InvertedListAppend.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "InvertedListAppend.cuh"
#include "../../FaissAssert.h"
#include "../utils/Float16.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"
#include <algorithm> // for std::min

namespace faiss { namespace gpu {

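// Kernel: one thread per entry in listIds. Each thread overwrites the
// recorded length, code pointer, and index pointer for its inverted list.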
__global__ void
runUpdateListPointers(Tensor<int, 1, true> listIds,
                      Tensor<int, 1, true> newListLength,
                      Tensor<void*, 1, true> newCodePointers,
                      Tensor<void*, 1, true> newIndexPointers,
                      int* listLengths,
                      void** listCodes,
                      void** listIndices) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[index];
  listLengths[listId] = newListLength[index];
  listCodes[listId] = newCodePointers[index];
  listIndices[listId] = newIndexPointers[index];
}

// Host-side wrapper: updates the device arrays of list lengths and storage
// pointers for the lists named in listIds, using one thread per updated list.
void
runUpdateListPointers(Tensor<int, 1, true>& listIds,
                      Tensor<int, 1, true>& newListLength,
                      Tensor<void*, 1, true>& newCodePointers,
                      Tensor<void*, 1, true>& newIndexPointers,
                      thrust::device_vector<int>& listLengths,
                      thrust::device_vector<void*>& listCodes,
                      thrust::device_vector<void*>& listIndices,
                      cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

  runUpdateListPointers<<<grid, block, 0, stream>>>(
    listIds, newListLength, newCodePointers, newIndexPointers,
    listLengths.data().get(),
    listCodes.data().get(),
    listIndices.data().get());

  CUDA_TEST_ERROR();
}
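
// Launch-size arithmetic for the wrapper above, e.g.: with 10,000 updated
// lists and a device limit of 1,024 threads per block, numThreads = 1,024 and
// numBlocks = divUp(10,000, 1,024) = 10; the kernel's bounds check covers the
// excess threads in the final, partially filled block.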

// Kernel: append one PQ-encoded vector per thread. Each thread writes its
// vector's user index (if stored on the GPU) and its code bytes at the
// reserved offset within the destination inverted list.
template <IndicesOptions Opt>
__global__ void
ivfpqInvertedListAppend(Tensor<int, 1, true> listIds,
                        Tensor<int, 1, true> listOffset,
                        Tensor<int, 2, true> encodings,
                        Tensor<long, 1, true> indices,
                        void** listCodes,
                        void** listIndices) {
  int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x;

  if (encodingToAdd >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[encodingToAdd];
  int offset = listOffset[encodingToAdd];

  // The added vector could be invalid (contains NaNs etc.)
  if (listId == -1 || offset == -1) {
    return;
  }

  auto encoding = encodings[encodingToAdd];
  long index = indices[encodingToAdd];

  if (Opt == INDICES_32_BIT) {
    // FIXME: there could be overflow here, but where should we check this?
    ((int*) listIndices[listId])[offset] = (int) index;
  } else if (Opt == INDICES_64_BIT) {
    ((long*) listIndices[listId])[offset] = (long) index;
  } else {
    // INDICES_CPU or INDICES_IVF; no indices are being stored
  }

  unsigned char* codeStart =
    ((unsigned char*) listCodes[listId]) + offset * encodings.getSize(1);

  // FIXME: slow
  for (int i = 0; i < encodings.getSize(1); ++i) {
    codeStart[i] = (unsigned char) encoding[i];
  }
}
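
// Code layout note for the kernel above: codes within a list are stored
// contiguously, one byte per encoding component, so a vector reserved at
// position `offset` starts at byte offset * encodings.getSize(1). For
// example, with 8 bytes per code, the vector at offset 3 begins at byte 24
// of that list's code storage.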

// Host-side wrapper: appends a batch of PQ codes (and, if stored on the GPU,
// their user indices) to their assigned inverted lists, one thread per vector.
void
runIVFPQInvertedListAppend(Tensor<int, 1, true>& listIds,
                           Tensor<int, 1, true>& listOffset,
                           Tensor<int, 2, true>& encodings,
                           Tensor<long, 1, true>& indices,
                           thrust::device_vector<void*>& listCodes,
                           thrust::device_vector<void*>& listIndices,
                           IndicesOptions indicesOptions,
                           cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

#define RUN_APPEND(IND)                                          \
  do {                                                           \
    ivfpqInvertedListAppend<IND><<<grid, block, 0, stream>>>(    \
      listIds, listOffset, encodings, indices,                   \
      listCodes.data().get(),                                    \
      listIndices.data().get());                                 \
  } while (0)

  if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) {
    // no need to maintain indices on the GPU
    RUN_APPEND(INDICES_IVF);
  } else if (indicesOptions == INDICES_32_BIT) {
    RUN_APPEND(INDICES_32_BIT);
  } else if (indicesOptions == INDICES_64_BIT) {
    RUN_APPEND(INDICES_64_BIT);
  } else {
    // unknown index storage type
    FAISS_ASSERT(false);
  }

  CUDA_TEST_ERROR();

#undef RUN_APPEND
}
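
// Dispatch note for the wrapper above: INDICES_CPU and INDICES_IVF both mean
// that no per-vector index is kept on the GPU, so they share the INDICES_IVF
// kernel instantiation; only three instantiations of ivfpqInvertedListAppend
// are generated here. RUN_APPEND(INDICES_32_BIT), for instance, expands to
//
//   ivfpqInvertedListAppend<INDICES_32_BIT><<<grid, block, 0, stream>>>(
//     listIds, listOffset, encodings, indices,
//     listCodes.data().get(), listIndices.data().get());
//
// wrapped in the usual do { ... } while (0).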

// Kernel: append one full-precision (or float16) vector per block. Thread 0
// writes the vector's user index (if stored on the GPU); all threads of the
// block cooperate to copy the vector's components.
template <IndicesOptions Opt, bool Exact, bool Float16>
__global__ void
ivfFlatInvertedListAppend(Tensor<int, 1, true> listIds,
                          Tensor<int, 1, true> listOffset,
                          Tensor<float, 2, true> vecs,
                          Tensor<long, 1, true> indices,
                          void** listData,
                          void** listIndices) {
  int vec = blockIdx.x;

  int listId = listIds[vec];
  int offset = listOffset[vec];

  // The added vector could be invalid (contains NaNs etc.)
  if (listId == -1 || offset == -1) {
    return;
  }

  if (threadIdx.x == 0) {
    long index = indices[vec];

    if (Opt == INDICES_32_BIT) {
      // FIXME: there could be overflow here, but where should we check this?
      ((int*) listIndices[listId])[offset] = (int) index;
    } else if (Opt == INDICES_64_BIT) {
      ((long*) listIndices[listId])[offset] = (long) index;
    } else {
      // INDICES_CPU or INDICES_IVF; no indices are being stored
    }
  }

#ifdef FAISS_USE_FLOAT16
  // FIXME: should use half2 for better memory b/w
  if (Float16) {
    half* vecStart = ((half*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = __float2half(vecs[vec][threadIdx.x]);
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = __float2half(vecs[vec][i]);
      }
    }
  }
#else
  static_assert(!Float16, "float16 unsupported");
#endif

  if (!Float16) {
    float* vecStart = ((float*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = vecs[vec][threadIdx.x];
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = vecs[vec][i];
      }
    }
  }
}
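
// Exact vs. strided copy in the kernel above: the host launches one block per
// vector with min(dim, maxThreads) threads. When the dimension fits in a
// single block (Exact == true), each thread writes exactly one component;
// otherwise each thread strides by blockDim.x. For example, dim = 256 with a
// 256-thread block takes the Exact path, while dim = 2048 with a 1024-thread
// block has each thread write two components.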

// Host-side wrapper: appends a batch of full-precision vectors (stored as
// float32 or float16) to their assigned inverted lists, one block per vector.
void
runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
                             Tensor<int, 1, true>& listOffset,
                             Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices,
                             bool useFloat16,
                             thrust::device_vector<void*>& listData,
                             thrust::device_vector<void*>& listIndices,
                             IndicesOptions indicesOptions,
                             cudaStream_t stream) {
  int maxThreads = getMaxThreadsCurrentDevice();
  bool exact = vecs.getSize(1) <= maxThreads;

  // Each block will handle appending a single vector
  dim3 grid(vecs.getSize(0));
  dim3 block(std::min(vecs.getSize(1), maxThreads));

#define RUN_APPEND_OPT(OPT, EXACT, FLOAT16)                      \
  do {                                                           \
    ivfFlatInvertedListAppend<OPT, EXACT, FLOAT16>               \
      <<<grid, block, 0, stream>>>(                              \
        listIds, listOffset, vecs, indices,                      \
        listData.data().get(),                                   \
        listIndices.data().get());                               \
  } while (0)

#define RUN_APPEND(EXACT, FLOAT16)                                            \
  do {                                                                        \
    if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) { \
      /* no indices are maintained on the GPU */                              \
      RUN_APPEND_OPT(INDICES_IVF, EXACT, FLOAT16);                            \
    } else if (indicesOptions == INDICES_32_BIT) {                            \
      RUN_APPEND_OPT(INDICES_32_BIT, EXACT, FLOAT16);                         \
    } else if (indicesOptions == INDICES_64_BIT) {                            \
      RUN_APPEND_OPT(INDICES_64_BIT, EXACT, FLOAT16);                         \
    } else {                                                                  \
      FAISS_ASSERT(false);                                                    \
    }                                                                         \
  } while (0)

  if (useFloat16) {
#ifdef FAISS_USE_FLOAT16
    if (exact) {
      RUN_APPEND(true, true);
    } else {
      RUN_APPEND(false, true);
    }
#else
    // no float16 support
    FAISS_ASSERT(false);
#endif
  } else {
    if (exact) {
      RUN_APPEND(true, false);
    } else {
      RUN_APPEND(false, false);
    }
  }

  CUDA_TEST_ERROR();

#undef RUN_APPEND
#undef RUN_APPEND_OPT
}
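
// Parallelization note: unlike the PQ append path above, which assigns one
// thread per (short) code, the flat path assigns one block per vector,
// presumably because each append copies a full dense vector of
// vecs.getSize(1) components. As in the PQ path, entries with listId == -1 or
// offset == -1 (e.g. vectors rejected earlier as invalid) are skipped by the
// kernel.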

} } // namespace