/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the CC-by-NC license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once

#include "Float16.cuh"
#include "Select.cuh"

namespace faiss { namespace gpu {
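
// warpSelect: performs a k-selection over each row of `in`, using one warp
// per row. Each warp maintains a WarpSelect queue of the best k values seen
// so far (ordering determined by Dir) and writes the selected values and
// their column indices to the corresponding row of outK / outV.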
template <typename K,
          typename IndexType,
          bool Dir,
          int NumWarpQ,
          int NumThreadQ,
          int ThreadsPerBlock>
__global__ void warpSelect(Tensor<K, 2, true> in,
                           Tensor<K, 2, true> outK,
                           Tensor<IndexType, 2, true> outV,
                           K initK,
                           IndexType initV,
                           int k) {
  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  WarpSelect<K, IndexType, Dir, Comparator<K>,
             NumWarpQ, NumThreadQ, ThreadsPerBlock>
    heap(initK, initV, k);

  // Each warp in the block handles one row of the input
  int warpId = threadIdx.x / kWarpSize;
  int row = blockIdx.x * kNumWarps + warpId;

  if (row >= in.getSize(0)) {
    return;
  }

  K* inStart = in[row].data();
  int i = getLaneId();

  // Whole warps must participate in the selection
  int limit = utils::roundDown(in.getSize(1), kWarpSize);

  for (; i < limit; i += kWarpSize) {
    heap.add(inStart[i], (IndexType) i);
  }

  // Handle non-warp multiple remainder
  if (i < in.getSize(1)) {
    heap.addThreadQ(inStart[i], (IndexType) i);
  }

  // Merge per-thread queues and write out the final k results for this row
  heap.reduce();
  heap.writeOut(outK[row].data(),
                outV[row].data(), k);
}
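
// Host-side entry points that launch warpSelect on the given stream for
// float inputs (and for half inputs when FAISS_USE_FLOAT16 is defined).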
void runWarpSelect(Tensor<float, 2, true>& in,
                   Tensor<float, 2, true>& outKeys,
                   Tensor<int, 2, true>& outIndices,
                   bool dir, int k, cudaStream_t stream);

#ifdef FAISS_USE_FLOAT16
void runWarpSelect(Tensor<half, 2, true>& in,
                   Tensor<half, 2, true>& outKeys,
                   Tensor<int, 2, true>& outIndices,
                   bool dir, int k, cudaStream_t stream);
#endif
} } // namespace