13 #include "../../FaissAssert.h" 
   15 #include "DeviceUtils.h" 
   20 namespace faiss { 
namespace gpu {
 
   22 template <
typename T, 
typename IndexT>
 
   24   static constexpr 
int kMaxDims = 8;
 
   27   IndexT sizes[kMaxDims];
 
   28   IndexT strides[kMaxDims];
 
   32 template <
typename T, 
typename IndexT, 
int Dim>
 
   39     for (
int i = Dim - 1; i >= 0; --i) {
 
   40       IndexT curDimIndex = linearId % info.sizes[i];
 
   41       IndexT curDimOffset = curDimIndex * info.strides[i];
 
   43       offset += curDimOffset;
 
   46         linearId /= info.sizes[i];
 
   54 template <
typename T, 
typename IndexT>
 
   62 template <
typename T, 
typename IndexT, 
int Dim>
 
   66   for (
int i = 0; i < Dim; ++i) {
 
   67     info.sizes[i] = (IndexT) t.
getSize(i);
 
   68     info.strides[i] = (IndexT) t.
getStride(i);
 
   77 template <
typename T, 
typename IndexT, 
int DimInput, 
int DimOutput>
 
   78 __global__ 
void transposeAny(TensorInfo<T, IndexT> input,
 
   79                              TensorInfo<T, IndexT> output,
 
   81   for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
 
   83        i += gridDim.x + blockDim.x) {
 
   84     auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
 
   85     auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
 
   87 #if __CUDA_ARCH__ >= 350 
   88     output.data[outputOffset] = __ldg(&input.data[inputOffset]);
 
   90     output.data[outputOffset] = input.data[inputOffset];
 
  104 template <
typename T, 
int Dim>
 
  105 void runTransposeAny(Tensor<T, Dim, true>& in,
 
  107                      Tensor<T, Dim, true>& out,
 
  108                      cudaStream_t stream) {
 
  109   static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
 
  110                 "too many dimensions");
 
  112   FAISS_ASSERT(dim1 != dim2);
 
  113   FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
 
  117   for (
int i = 0; i < Dim; ++i) {
 
  118     outSize[i] = in.getSize(i);
 
  121   std::swap(outSize[dim1], outSize[dim2]);
 
  123   for (
int i = 0; i < Dim; ++i) {
 
  124     FAISS_ASSERT(out.getSize(i) == outSize[i]);
 
  127   size_t totalSize = in.numElements();
 
  128   size_t block = std::min((
size_t) getMaxThreadsCurrentDevice(), totalSize);
 
  130   if (totalSize <= (
size_t) std::numeric_limits<int>::max()) {
 
  132     auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
 
  133     auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
 
  135     std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
 
  136     std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
 
  138     auto grid = std::min(utils::divUp(totalSize, block), (
size_t) 4096);
 
  140     transposeAny<T, 
unsigned int, Dim, -1>
 
  141       <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
 
  143     auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
 
  144     auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
 
  146     std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
 
  147     std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
 
  149     auto grid = std::min(utils::divUp(totalSize, block), (
size_t) 4096);
 
  151     transposeAny<T, 
unsigned long, Dim, -1>
 
  152       <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
 
__host__ __device__ IndexT getSize(int i) const 
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data. 
__host__ __device__ IndexT getStride(int i) const