13 #include "Float16.cuh" 
   16 namespace faiss { 
namespace gpu {
 
   // blockSelect: __global__ kernel that feeds entries of `in` (indexed as
   // in[row][i]) into a BlockSelect object and afterwards copies the first `k`
   // entries of two __shared__ buffers into outK[row] / outV[row].
   //
   // NOTE(review): the bare numbers at the start of the statements below
   // (24, 25, 26, 30, ...) look like line numbers left over from a listing, and
   // they skip (26 -> 30, 37 -> 42, 46 -> 50, 51 -> 56). The template header,
   // the declarations of `initK`, `initV`, `k`, `row` and `i`, any step between
   // the add phase and the copy-out, and the closing braces are therefore not
   // visible here -- restore them from the original file before relying on
   // this text.
   24 __global__ 
void blockSelect(Tensor<K, 2, true> in,
 
   25                             Tensor<K, 2, true> outK,
 
   26                             Tensor<IndexType, 2, true> outV,
 
   // ThreadsPerBlock divided by kWarpSize; used to size the shared buffers.
   30   constexpr 
int kNumWarps = ThreadsPerBlock / kWarpSize;
 
   // Block-shared key / value buffers, kNumWarps * NumWarpQ elements each.
   // Both are handed to the BlockSelect constructor and read back at the end.
   32   __shared__ K smemK[kNumWarps * NumWarpQ];
 
   33   __shared__ IndexType smemV[kNumWarps * NumWarpQ];
 
   // Selection object built from the initial key/value, the two shared
   // buffers and k. Comparator<K> is supplied as the comparison policy.
   35   BlockSelect<K, IndexType, Dir, Comparator<K>,
 
   36             NumWarpQ, NumThreadQ, ThreadsPerBlock>
 
   37     heap(initK, initV, smemK, smemV, k);
 
   // Upper bound for the main loop: in.getSize(1) passed through
   // utils::roundDown with kWarpSize (presumably the largest multiple of
   // kWarpSize not exceeding the row length -- confirm against utils).
   42   int limit = utils::roundDown(in.getSize(1), kWarpSize);
 
   // Main loop: each thread advances i by blockDim.x and adds the pair
   // (in[row][i], i) through heap.add while i is below `limit`.
   45   for (; i < limit; i += blockDim.x) {
 
   46     heap.add(in[row][i], (IndexType) i);
 
   // Tail: a thread whose i is past `limit` but still inside the row adds its
   // element through heap.addThreadQ instead of heap.add.
   50   if (i < in.getSize(1)) {
 
   51     heap.addThreadQ(in[row][i], (IndexType) i);
 
   // Copy-out: threads step by blockDim.x over [0, k) and copy the shared
   // buffers into the output tensors for this row.
   // NOTE(review): whatever finalizes/synchronizes smemK/smemV before this
   // read is not visible in this excerpt -- TODO confirm.
   56   for (
int i = threadIdx.x; i < k; i += blockDim.x) {
 
   57     outK[row][i] = smemK[i];
 
   58     outV[row][i] = smemV[i];
 
   // Declaration of the float-key overload of runBlockSelect. Only the
   // prototype is present here; the definition lives elsewhere.
   //
   //   in          2-D float tensor passed by reference.
   //   outKeys     2-D float tensor passed by reference (presumably receives
   //               the selected keys -- confirm against the definition).
   //   outIndices  2-D int tensor passed by reference (presumably receives the
   //               positions of the selected keys -- confirm).
   //   dir         boolean flag (presumably the selection direction, i.e.
   //               largest vs. smallest -- confirm).
   //   k           integer count (presumably the number of entries selected
   //               per row -- confirm).
   //   stream      CUDA stream handle (presumably the stream the kernel is
   //               launched on -- confirm).
   void runBlockSelect(Tensor<float, 2, true>& in,
                       Tensor<float, 2, true>& outKeys,
                       Tensor<int, 2, true>& outIndices,
                       bool dir, int k, cudaStream_t stream);
 
   67 #ifdef FAISS_USE_FLOAT16 
   // Declaration of the half-precision-key overload of runBlockSelect. Only
   // the prototype is present here; the definition lives elsewhere. It sits
   // after a FAISS_USE_FLOAT16 preprocessor guard.
   //
   //   in          2-D half tensor passed by reference.
   //   outKeys     2-D half tensor passed by reference (presumably receives
   //               the selected keys -- confirm against the definition).
   //   outIndices  2-D int tensor passed by reference (presumably receives the
   //               positions of the selected keys -- confirm).
   //   dir         boolean flag (presumably the selection direction, i.e.
   //               largest vs. smallest -- confirm).
   //   k           integer count (presumably the number of entries selected
   //               per row -- confirm).
   //   stream      CUDA stream handle (presumably the stream the kernel is
   //               launched on -- confirm).
   void runBlockSelect(Tensor<half, 2, true>& in,
                       Tensor<half, 2, true>& outKeys,
                       Tensor<int, 2, true>& outIndices,
                       bool dir, int k, cudaStream_t stream);