InvertedListAppend.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "InvertedListAppend.cuh"
#include "../../FaissAssert.h"
#include "../utils/Float16.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"

#include <algorithm> // std::min

namespace faiss { namespace gpu {

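// Kernel: one thread per updated list. Scatters each list's new length,
// code pointer and index pointer into the global inverted-list arrays.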
__global__ void
runUpdateListPointers(Tensor<int, 1, true> listIds,
                      Tensor<int, 1, true> newListLength,
                      Tensor<void*, 1, true> newCodePointers,
                      Tensor<void*, 1, true> newIndexPointers,
                      int* listLengths,
                      void** listCodes,
                      void** listIndices) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[index];
  listLengths[listId] = newListLength[index];
  listCodes[listId] = newCodePointers[index];
  listIndices[listId] = newIndexPointers[index];
}

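// Host-side launcher for the pointer-update kernel above; a flat 1-D
// launch over the number of lists being updated.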
void
runUpdateListPointers(Tensor<int, 1, true>& listIds,
                      Tensor<int, 1, true>& newListLength,
                      Tensor<void*, 1, true>& newCodePointers,
                      Tensor<void*, 1, true>& newIndexPointers,
                      thrust::device_vector<int>& listLengths,
                      thrust::device_vector<void*>& listCodes,
                      thrust::device_vector<void*>& listIndices,
                      cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

  runUpdateListPointers<<<grid, block, 0, stream>>>(
    listIds, newListLength, newCodePointers, newIndexPointers,
    listLengths.data().get(),
    listCodes.data().get(),
    listIndices.data().get());
}

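// Kernel: one thread per vector being appended. Writes the vector's PQ
// encoding (and, if indices are stored on the GPU, its user index) at the
// precomputed offset within its assigned inverted list.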
template <IndicesOptions Opt>
__global__ void
ivfpqInvertedListAppend(Tensor<int, 1, true> listIds,
                        Tensor<int, 1, true> listOffset,
                        Tensor<int, 2, true> encodings,
                        Tensor<long, 1, true> indices,
                        void** listCodes,
                        void** listIndices) {
  int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x;

  if (encodingToAdd >= listIds.getSize(0)) {
    return;
  }

  int listId = listIds[encodingToAdd];
  int offset = listOffset[encodingToAdd];

  // Add vector could be invalid (contains NaNs etc)
  if (listId == -1 || offset == -1) {
    return;
  }

  auto encoding = encodings[encodingToAdd];
  long index = indices[encodingToAdd];

  if (Opt == INDICES_32_BIT) {
    // FIXME: there could be overflow here, but where should we check this?
    ((int*) listIndices[listId])[offset] = (int) index;
  } else if (Opt == INDICES_64_BIT) {
    ((long*) listIndices[listId])[offset] = (long) index;
  } else {
    // INDICES_CPU or INDICES_IVF; no indices are being stored
  }

  unsigned char* codeStart =
    ((unsigned char*) listCodes[listId]) + offset * encodings.getSize(1);

  // FIXME: slow
  for (int i = 0; i < encodings.getSize(1); ++i) {
    codeStart[i] = (unsigned char) encoding[i];
  }
}

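// Host-side launcher for the IVFPQ append kernel; selects the kernel
// instantiation matching how user indices are stored.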
void
runIVFPQInvertedListAppend(Tensor<int, 1, true>& listIds,
                           Tensor<int, 1, true>& listOffset,
                           Tensor<int, 2, true>& encodings,
                           Tensor<long, 1, true>& indices,
                           thrust::device_vector<void*>& listCodes,
                           thrust::device_vector<void*>& listIndices,
                           IndicesOptions indicesOptions,
                           cudaStream_t stream) {
  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);

  dim3 grid(numBlocks);
  dim3 block(numThreads);

#define RUN_APPEND(IND)                                       \
  do {                                                        \
    ivfpqInvertedListAppend<IND><<<grid, block, 0, stream>>>( \
      listIds, listOffset, encodings, indices,                \
      listCodes.data().get(),                                 \
      listIndices.data().get());                              \
  } while (0)

  if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) {
    // no need to maintain indices on the GPU
    RUN_APPEND(INDICES_IVF);
  } else if (indicesOptions == INDICES_32_BIT) {
    RUN_APPEND(INDICES_32_BIT);
  } else if (indicesOptions == INDICES_64_BIT) {
    RUN_APPEND(INDICES_64_BIT);
  } else {
    // unknown index storage type
    FAISS_ASSERT(false);
  }

#undef RUN_APPEND
}

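// Kernel: one block per vector being appended. Thread 0 writes the user
// index (if stored on the GPU); all threads cooperatively copy the vector
// data, as float32 or float16, into its inverted list. `Exact` means the
// block has exactly one thread per vector dimension.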
template <IndicesOptions Opt, bool Exact, bool Float16>
__global__ void
ivfFlatInvertedListAppend(Tensor<int, 1, true> listIds,
                          Tensor<int, 1, true> listOffset,
                          Tensor<float, 2, true> vecs,
                          Tensor<long, 1, true> indices,
                          void** listData,
                          void** listIndices) {
  int vec = blockIdx.x;

  int listId = listIds[vec];
  int offset = listOffset[vec];

  // Add vector could be invalid (contains NaNs etc)
  if (listId == -1 || offset == -1) {
    return;
  }

  if (threadIdx.x == 0) {
    long index = indices[vec];

    if (Opt == INDICES_32_BIT) {
      // FIXME: there could be overflow here, but where should we check this?
      ((int*) listIndices[listId])[offset] = (int) index;
    } else if (Opt == INDICES_64_BIT) {
      ((long*) listIndices[listId])[offset] = (long) index;
    } else {
      // INDICES_CPU or INDICES_IVF; no indices are being stored
    }
  }

#ifdef FAISS_USE_FLOAT16
  // FIXME: should use half2 for better memory b/w
  if (Float16) {
    half* vecStart = ((half*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = __float2half(vecs[vec][threadIdx.x]);
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = __float2half(vecs[vec][i]);
      }
    }
  }
#else
  static_assert(!Float16, "float16 unsupported");
#endif

  if (!Float16) {
    float* vecStart = ((float*) listData[listId]) + offset * vecs.getSize(1);

    if (Exact) {
      vecStart[threadIdx.x] = vecs[vec][threadIdx.x];
    } else {
      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
        vecStart[i] = vecs[vec][i];
      }
    }
  }
}

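// Host-side launcher for the IVFFlat append kernel; dispatches on float16
// storage, index storage type, and whether one thread per dimension fits
// in a single block.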
void
runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
                             Tensor<int, 1, true>& listOffset,
                             Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices,
                             bool useFloat16,
                             thrust::device_vector<void*>& listData,
                             thrust::device_vector<void*>& listIndices,
                             IndicesOptions indicesOptions,
                             cudaStream_t stream) {
  int maxThreads = getMaxThreadsCurrentDevice();
  bool exact = vecs.getSize(1) <= maxThreads;

  // Each block will handle appending a single vector
  dim3 grid(vecs.getSize(0));
  dim3 block(std::min(vecs.getSize(1), maxThreads));

#define RUN_APPEND_OPT(OPT, EXACT, FLOAT16)          \
  do {                                               \
    ivfFlatInvertedListAppend<OPT, EXACT, FLOAT16>   \
      <<<grid, block, 0, stream>>>(                  \
        listIds, listOffset, vecs, indices,          \
        listData.data().get(),                       \
        listIndices.data().get());                   \
  } while (0)

#define RUN_APPEND(EXACT, FLOAT16)                                            \
  do {                                                                        \
    if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) { \
      /* no indices are maintained on the GPU */                              \
      RUN_APPEND_OPT(INDICES_IVF, EXACT, FLOAT16);                            \
    } else if (indicesOptions == INDICES_32_BIT) {                            \
      RUN_APPEND_OPT(INDICES_32_BIT, EXACT, FLOAT16);                         \
    } else if (indicesOptions == INDICES_64_BIT) {                            \
      RUN_APPEND_OPT(INDICES_64_BIT, EXACT, FLOAT16);                         \
    } else {                                                                  \
      FAISS_ASSERT(false);                                                    \
    }                                                                         \
  } while (0)

  if (useFloat16) {
#ifdef FAISS_USE_FLOAT16
    if (exact) {
      RUN_APPEND(true, true);
    } else {
      RUN_APPEND(false, true);
    }
#else
    // no float16 support
    FAISS_ASSERT(false);
#endif
  } else {
    if (exact) {
      RUN_APPEND(true, false);
    } else {
      RUN_APPEND(false, false);
    }
  }

#undef RUN_APPEND
#undef RUN_APPEND_OPT
}

} } // namespace