Unverified Commit 05feadff authored by Francisco Massa, committed by GitHub

Revert "Support for running on arbitrary CUDA device. (#537)" (#608)

This reverts commit f0318794.
parent bd39d2c1
......@@ -68,27 +68,6 @@ image = ...
predictions = coco_demo.run_on_opencv_image(image)
```
### Use it on an arbitrary GPU device
On a multi-GPU machine, you may only have access to a specific GPU device (e.g. `cuda:1` or `cuda:2`) for inference, testing, or training.
The repository currently supports two ways to control which device is used.
#### 1. Using the CUDA_VISIBLE_DEVICES environment variable (recommended)
Here is an example that runs the Mask R-CNN R-50 FPN quick schedule on the second device (`cuda:1`):
```bash
export CUDA_VISIBLE_DEVICES=1
python tools/train_net.py --config-file=configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml
```
The whole session will then run on the second GPU device (`cuda:1`).
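The same effect can be achieved without exporting the variable by setting it inline for a single invocation (standard shell behavior, shown as a sketch):
```bash
# The variable only applies to this one command; nothing is exported.
CUDA_VISIBLE_DEVICES=1 python tools/train_net.py --config-file=configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml
```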
#### 2. Using the MODEL.DEVICE flag
Alternatively, the program can run on a specific GPU device by setting the `MODEL.DEVICE` flag:
```bash
python tools/train_net.py --config-file=configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml MODEL.DEVICE cuda:1
```
Here, appending `MODEL.DEVICE cuda:1` configures the target device.
*Note that with this method a small amount of memory is still allocated on `cuda:0`.*
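If the residual allocation on `cuda:0` is a concern, one possible workaround (a sketch, not an officially documented option) is to combine both methods: restrict visibility to the target GPU, which the CUDA runtime then remaps to `cuda:0` inside the process:
```bash
# Only the physical second GPU is visible; inside the process it appears as cuda:0.
export CUDA_VISIBLE_DEVICES=1
python tools/train_net.py --config-file=configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml MODEL.DEVICE cuda:0
```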
## Perform training on COCO dataset
For the following examples to work, you need to first install `maskrcnn_benchmark`.
......
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGuard.h>
#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
......@@ -264,8 +263,6 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
......@@ -314,7 +311,6 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
const int sampling_ratio) {
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
......
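For context, the lines removed by this revert used `at::cuda::CUDAGuard`, an RAII helper from ATen that switches the current CUDA device on construction and restores the previous one on destruction, so that subsequent allocations and kernel launches target the input tensor's device rather than the default `cuda:0`. A minimal sketch of the pattern (the function name is hypothetical):
```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGuard.h>

// Hypothetical example of the reverted pattern: while `device_guard` is in
// scope, the current CUDA device is input.device(), so the operation below
// runs on that device instead of the default cuda:0.
at::Tensor double_on_own_device(const at::Tensor& input) {
  AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(input.device());
  return input * 2;  // the previous device is restored when device_guard is destroyed
}
```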
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGuard.h>
#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
......@@ -116,8 +115,6 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
......@@ -170,7 +167,6 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
// TODO add more checks
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
......
......@@ -4,7 +4,6 @@
// cyfu@cs.unc.edu
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGuard.h>
#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
......@@ -112,8 +111,6 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
at::cuda::CUDAGuard device_guard(logits.device());
const int num_samples = logits.size(0);
auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
......@@ -160,8 +157,6 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
const int num_samples = logits.size(0);
AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
at::cuda::CUDAGuard device_guard(logits.device());
auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
auto d_logits_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
......
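Relatedly, `at::cuda::getCurrentCUDAStream()` (used above) returns the current stream for the *current* device, which is why the reverted guards mattered for multi-GPU correctness. A sketch of the idea, with a hypothetical kernel name:
```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGuard.h>

// Hypothetical launch helper: the guard makes both the stream lookup and any
// subsequent kernel launch refer to t.device() rather than whatever device
// happened to be current when the function was called.
void launch_on(const at::Tensor& t) {
  at::cuda::CUDAGuard guard(t.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // my_kernel<<<grid, block, 0, stream>>>(...);  // illustrative only
}
```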
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGuard.h>
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
......@@ -71,8 +70,6 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(boxes.device());
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
......