/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include <cuda.h>
#include <utility>
#include "../GpuResources.h"
#include "DeviceTensor.cuh"
// For float16, we use the half datatype, expecting it to be a struct
// as in CUDA 7.5.
#if CUDA_VERSION >= 7050
#define FAISS_USE_FLOAT16 1
// Some compute capabilities have full float16 ALUs.
#if __CUDA_ARCH__ >= 530
#define FAISS_USE_FULL_FLOAT16 1
#endif // __CUDA_ARCH__ types
#endif // CUDA_VERSION
#ifdef FAISS_USE_FLOAT16
#include <cuda_fp16.h>
#endif
namespace faiss { namespace gpu {
#ifdef FAISS_USE_FLOAT16
// 64 bits containing 4 half (float16) values
struct Half4 {
  half2 a;
  half2 b;
};
inline __device__ float4 half4ToFloat4(Half4 v) {
  float2 a = __half22float2(v.a);
  float2 b = __half22float2(v.b);

  float4 out;
  out.x = a.x;
  out.y = a.y;
  out.z = b.x;
  out.w = b.y;

  return out;
}
inline __device__ Half4 float4ToHalf4(float4 v) {
  float2 a;
  a.x = v.x;
  a.y = v.y;

  float2 b;
  b.x = v.z;
  b.y = v.w;

  Half4 out;
  out.a = __float22half2_rn(a);
  out.b = __float22half2_rn(b);

  return out;
}
// 128 bits containing 8 half (float16) values
struct Half8 {
  Half4 a;
  Half4 b;
};
/// Returns true if the given device supports native float16 math
bool getDeviceSupportsFloat16Math(int device);
/// Copies `in` to `out` while performing a float32 -> float16 conversion
void runConvertToFloat16(half* out,
                         const float* in,
                         size_t num,
                         cudaStream_t stream);
/// Copies `in` to `out` while performing a float16 -> float32
/// conversion
void runConvertToFloat32(float* out,
                         const half* in,
                         size_t num,
                         cudaStream_t stream);
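// Example usage (sketch; `devF16`, `devF32` and `n` are hypothetical device
// buffers / element count, `stream` an existing cudaStream_t):
//
//   runConvertToFloat16(devF16, devF32, n, stream); // float32 -> float16
//   runConvertToFloat32(devF32, devF16, n, stream); // float16 -> float32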
template <int Dim>
void toHalf(cudaStream_t stream,
            Tensor<float, Dim, true>& in,
            Tensor<half, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
}
template <int Dim>
DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
                                     cudaStream_t stream,
                                     Tensor<float, Dim, true>& in) {
  DeviceTensor<half, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<half, Dim, true>(
                      resources->getMemoryManagerCurrentDevice(),
                      in.sizes(),
                      stream));
  } else {
    out = std::move(DeviceTensor<half, Dim, true>(in.sizes()));
  }

  toHalf<Dim>(stream, in, out);
  return out;
}
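// Example (sketch; `resources`, `stream` and a 2-d float tensor `vecsFloat`
// are assumed to exist): convert to float16, letting the overload allocate
// the output from the current device's memory manager:
//
//   DeviceTensor<half, 2, true> vecsHalf =
//     toHalf<2>(resources, stream, vecsFloat);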
template <int Dim>
void fromHalf(cudaStream_t stream,
              Tensor<half, Dim, true>& in,
              Tensor<float, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
}
template <int Dim>
DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
                                        cudaStream_t stream,
                                        Tensor<half, Dim, true>& in) {
  DeviceTensor<float, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<float, Dim, true>(
                      resources->getMemoryManagerCurrentDevice(),
                      in.sizes(),
                      stream));
  } else {
    out = std::move(DeviceTensor<float, Dim, true>(in.sizes()));
  }

  fromHalf<Dim>(stream, in, out);
  return out;
}
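// Example (sketch; `distancesHalf` is a hypothetical 2-d half tensor produced
// by an earlier kernel): convert back to float32 for host-visible results:
//
//   DeviceTensor<float, 2, true> distances =
//     fromHalf<2>(resources, stream, distancesHalf);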
/// Converts a single float32 value to float16 on the host
__half hostFloat2Half(float v);
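// Example (sketch): producing a float16 scalar on the host, e.g. to
// initialize a constant passed to a kernel:
//
//   __half one = hostFloat2Half(1.0f);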
#endif // FAISS_USE_FLOAT16
} } // namespace