Commit df2edbe3 authored by Matthijs Douze

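Fix issue 320: drop `.ldg()` from reads of `output` in sumAlongColumns, and make canCastResize return false when the base pointer is not sizeof(U)-aligned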

parent 4fe92046
...@@ -47,7 +47,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input, ...@@ -47,7 +47,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
if (endRow) { if (endRow) {
for (int row = rowStart; row < output.getSize(0); ++row) { for (int row = rowStart; row < output.getSize(0); ++row) {
T out = output[row][col].ldg(); T out = output[row][col];
out = Math<T>::add(out, val); out = Math<T>::add(out, val);
output[row][col] = out; output[row][col] = out;
} }
...@@ -57,7 +57,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input, ...@@ -57,7 +57,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
for (int row = rowStart; row < rowEnd; row += kRowUnroll) { for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll #pragma unroll
for (int i = 0; i < kRowUnroll; ++i) { for (int i = 0; i < kRowUnroll; ++i) {
rows[i] = output[row + i][col].ldg(); rows[i] = output[row + i][col];
} }
#pragma unroll #pragma unroll
...@@ -86,7 +86,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input, ...@@ -86,7 +86,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
for (int row = rowStart; row < output.getSize(0); ++row) { for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll #pragma unroll
for (int i = 0; i < kColLoad; ++i) { for (int i = 0; i < kColLoad; ++i) {
T out = output[row][col + i * blockDim.x].ldg(); T out = output[row][col + i * blockDim.x];
out = Math<T>::add(out, val[i]); out = Math<T>::add(out, val[i]);
output[row][col + i * blockDim.x] = out; output[row][col + i * blockDim.x] = out;
} }
...@@ -100,7 +100,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input, ...@@ -100,7 +100,7 @@ __global__ void sumAlongColumns(Tensor<T, 1, true> input,
#pragma unroll #pragma unroll
for (int j = 0; j < kColLoad; ++j) { for (int j = 0; j < kColLoad; ++j) {
rows[i * kColLoad + j] = rows[i * kColLoad + j] =
output[row + i][col + j * blockDim.x].ldg(); output[row + i][col + j * blockDim.x];
} }
} }
......
...@@ -310,6 +310,11 @@ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canCastResize() const { ...@@ -310,6 +310,11 @@ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canCastResize() const {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T); constexpr int kMultiple = sizeof(U) / sizeof(T);
// Ensure that the base pointer is sizeof(U) aligned
if (((uintptr_t) data_) % sizeof(U) != 0) {
return false;
}
// Check all outer strides // Check all outer strides
for (int i = 0; i < Dim - 1; ++i) { for (int i = 0; i < Dim - 1; ++i) {
if (stride_[i] % kMultiple != 0) { if (stride_[i] % kMultiple != 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment