From 78fe149ab9ad12b8fb57e6127787c79c3cb2a281 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Wed, 26 Sep 2018 11:29:48 -0700 Subject: [PATCH 01/82] Fix ONNX bug, add symbolic for full Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12052 Differential Revision: D10044910 Pulled By: apaszke fbshipit-source-id: 015ef372966d7594e1b450e348d457429f6ef20d --- .../expect/TestOperators.test_full.expect | 148 ++++++++++++++++++ test/onnx/test_operators.py | 4 + torch/csrc/jit/passes/onnx/peephole.cpp | 2 +- torch/onnx/symbolic.py | 13 +- 4 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 test/onnx/expect/TestOperators.test_full.expect diff --git a/test/onnx/expect/TestOperators.test_full.expect b/test/onnx/expect/TestOperators.test_full.expect new file mode 100644 index 00000000000000..db975329ffa7b5 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -0,0 +1,148 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + output: "1" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: INT64 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "0" + output: "2" + op_type: "Shape" + } + node { + input: "2" + input: "1" + output: "3" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + output: "4" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: INT64 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "0" + output: "5" + op_type: "Shape" + } + node { + input: "5" + input: "4" + output: "6" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "3" + output: "7" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 0 + type: INTS + } + } + node { + input: "6" + output: "8" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 0 + type: INTS + } + } + node { + input: "7" + input: "8" + output: "9" + op_type: "Concat" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "9" + output: "10" + op_type: "ConstantFill" + attribute { + name: "dtype" + i: 1 + type: INT + } + attribute { + name: "input_as_shape" + i: 1 + type: INT + } + attribute { + name: "value" + f: 2 + type: FLOAT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "10" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 2dfdd409a15ce7..1e2d0ffb294219 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -287,6 +287,10 @@ def test_hardtanh(self): x = Variable(torch.randn(3, 4), requires_grad=True) self.assertONNX(lambda x: torch.nn.Hardtanh(-0.5, 0.5)(x), x) + def test_full(self): + x = torch.randn(3, 4, requires_grad=True) + self.assertONNX(lambda x: torch.full(x.shape, 2), x) + def test_max(self): x = Variable(torch.randn(3, 4), requires_grad=True) y = Variable(torch.randn(3, 4), requires_grad=True) diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index 8045a46a4af1ba..6e90780ecbf695 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -159,7 +159,7 @@ void 
eliminateNopTranspose(Block *b) {
     }
     if (n->kind() == onnx::Transpose) {
       if (isNopTranspose(n->is(attr::perm))) {
-        n->replaceAllUsesWith(n->input()->node());
+        n->output()->replaceAllUsesWith(n->input());
         it.destroyCurrent();
         continue;
       }
diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py
index e1468637ba4ff2..91de860e2c68ee 100644
--- a/torch/onnx/symbolic.py
+++ b/torch/onnx/symbolic.py
@@ -975,13 +975,24 @@ def zeros_like(g, input):
 ]
 
 
-@parse_args('v', 'i', 'i', 'v')
+@parse_args('v', 'i', 'v', 'v')
 def zeros(g, shape, scalar_type, layout, device):
     # NOTE: no way to set device in ONNX, so we ignore it
     return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type],
                 input_as_shape_i=1, value_f=0)
 
 
+def full(g, shape, value, scalar_type, layout, device):
+    const_value = _maybe_get_const(value, 't')
+    if _is_value(const_value):
+        # Non-constant fill value: build zeros of the right shape and dtype,
+        # then broadcast-add the fill value onto them.
+        tmp = zeros(g, shape, scalar_type, layout, device)
+        return add(g, tmp, value, g.op("Constant", value_t=torch.tensor(1)))
+    else:
+        scalar_type = _get_const(scalar_type, 'i', 'dtype')
+        return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type],
+                    input_as_shape_i=1, value_f=const_value)
+
+
 def full_like(g, input, fill_value):
     # TODO: a more efficient implementation (ConstantFill?)
     return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1)))

From c2f8f5076c5824890fc076ac05bfb00abd72b53f Mon Sep 17 00:00:00 2001
From: Doug Friedman
Date: Wed, 26 Sep 2018 12:12:12 -0700
Subject: [PATCH 02/82] add narrow() support for sparse tensors re: #8853
 (#11342)

Summary:
A couple of questions:
1) I used the log1p implementation in #8969 as a guide, especially for testing. I'm not sure what the ``skipIfROCM`` annotation is for, so I am unsure whether I need it for my test.
2) I implemented the branching logic in the narrow function itself; is this the right place to do so? I noticed that there are a number of places where sparse-specific logic is handled with just an if statement in this file. Or should I implement a separate dispatch in native_functions.yaml, as in the log1p PR?

And of course, happy to make any other updates/changes that I may have missed as well. This is my first PR to the project.
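Here is a simplified Python sketch of the branching logic from (2), mirroring the C++ added to TensorShape.cpp below (variable names are illustrative, and coalescing is ignored):

```python
def narrow_copy_sparse_sketch(indices, values, sizes, sparse_dims,
                              dim, start, length):
    if dim < sparse_dims:
        # Narrowing a sparse dim: keep entries whose index along `dim`
        # lies in [start, start + length), then shift them down by `start`.
        mask = (indices[dim] >= start) & (indices[dim] < start + length)
        new_indices = indices[:, mask]
        new_indices[dim] -= start
        new_values = values[mask]
    else:
        # Narrowing a dense dim is just a regular narrow on the values.
        new_indices = indices
        new_values = values.narrow(dim - sparse_dims + 1, start, length)
    new_sizes = list(sizes)
    new_sizes[dim] = length
    return new_indices, new_values, new_sizes
```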
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11342 Differential Revision: D9978430 Pulled By: weiyangfb fbshipit-source-id: e73dc20302ab58925afb19e609e31f4a38c634ad --- aten/src/ATen/core/Tensor.h | 1 + aten/src/ATen/core/TensorMethods.h | 3 ++ aten/src/ATen/core/Type.h | 1 + aten/src/ATen/native/TensorShape.cpp | 39 ++++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 8 +++++ docs/source/sparse.rst | 1 + test/test_sparse.py | 27 +++++++++++++++ torch/_tensor_docs.py | 11 ++++++ 8 files changed, 91 insertions(+) diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index fa31741313db39..31de431bf367b0 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -533,6 +533,7 @@ class CAFFE2_API Tensor { Tensor mv(const Tensor & vec) const; Tensor mvlgamma(int64_t p) const; Tensor & mvlgamma_(int64_t p); + Tensor narrow_copy(int64_t dim, int64_t start, int64_t length) const; Tensor narrow(int64_t dim, int64_t start, int64_t length) const; Tensor permute(IntList dims) const; Tensor pin_memory() const; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index c6197b4fc2d08b..857131298376b1 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -902,6 +902,9 @@ inline Tensor Tensor::mvlgamma(int64_t p) const { inline Tensor & Tensor::mvlgamma_(int64_t p) { return type().mvlgamma_(*this, p); } +inline Tensor Tensor::narrow_copy(int64_t dim, int64_t start, int64_t length) const { + return type().narrow_copy(*this, dim, start, length); +} inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { return type().narrow(*this, dim, start, length); } diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 3a2ccbe1e45edb..009ee309d7808a 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -492,6 +492,7 @@ struct CAFFE2_API Type { virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow_copy(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; virtual Tensor permute(const Tensor & self, IntList dims) const = 0; virtual Tensor pin_memory(const Tensor & self) const = 0; diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c470f554c14234..31b8f59a779a65 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -148,6 +148,45 @@ Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { return at::as_strided_(self, size, stride, self.storage_offset()); } +Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + int64_t allDim = self.dim(); + int64_t end = start+length; + AT_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); + AT_CHECK(dim >= 0 && dim < allDim, + "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, "."); + AT_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), + "Invalid range to narrow. 
range(start, start+length) must be a subset of range(0, ", self.size(dim), ").") + LongTensor indices = self._indices(); + int64_t sparseDims = self._sparseDims(); + + std::vector newSizes = self.sizes().vec(); + newSizes[dim]=length; + + Tensor newValues; + LongTensor newIndices; + if(dim < sparseDims){ + Tensor mask = (indices[dim] >= start).__and__((indices[dim] < end)); + newIndices = indices.masked_select(mask).view({sparseDims, -1}); + newIndices[dim].add_(-start); + Tensor nzIndices = mask.nonzero().view(-1); + newValues = self._values().index_select(0, nzIndices); + }else{ + /* This means we are narrowing on a dense dim, which is in effect just a + regular narrow on _values() */ + newIndices = indices; + int64_t ddim = dim - sparseDims + 1; + newValues = self._values().narrow_copy(ddim, start, length); + } + + SparseTensor newTensor = at::sparse_coo_tensor(newIndices, newValues, newSizes, self.type().options()); + _get_sparse_impl(newTensor)->set_coalesced(self.is_coalesced()); + return newTensor; +} + +Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + return self.narrow(dim, start, length).clone(); +} + Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2cc0995dabadad..9a7c56c3a499f1 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1174,6 +1174,14 @@ - func: mvlgamma_(Tensor self, int64_t p) -> Tensor variants: method +- func: narrow_copy(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor + variants: method + dispatch: + CPU: narrow_copy_dense + CUDA: narrow_copy_dense + SparseCPU: narrow_copy_sparse + SparseCUDA: narrow_copy_sparse + - func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor variants: function, method diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index a329bb049baac3..71dcaa7511fa26 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -110,6 +110,7 @@ An empty sparse tensor can be constructed by specifying its size: .. method:: mm .. method:: mul .. method:: mul_ + .. method:: narrow_copy .. method:: resizeAs_ .. method:: size .. 
method:: spadd
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 1304f42bda78fa..0e91dca37d4c3f 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -1023,6 +1023,33 @@ def test_shape(i_shapes, v_shapes, nnzs):
         test_shape([0, 3, 4], [3, 4, 5, 6], [0])
         test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12])
 
+    def _test_narrow(self, input, narrow_args):
+        expected = input.to_dense().narrow(*narrow_args)
+        self.assertEqual(expected, input.narrow_copy(*narrow_args).to_dense())
+
+    def _all_narrow_combs(self, shape):
+        for dim, dim_sz in enumerate(shape):
+            for start in range(dim_sz):
+                for length in range(dim_sz - start):
+                    yield [dim, start, length]
+
+    def test_narrow(self):
+        shape = [3, 3, 4, 2]
+        input, _, _ = self._gen_sparse(4, 19, shape)
+        for narrow_args in self._all_narrow_combs(shape):
+            self._test_narrow(input, narrow_args)
+
+        self.assertRaises(RuntimeError, lambda: input.narrow_copy(-1, 0, 3))  # dim < 0
+        self.assertRaises(RuntimeError, lambda: input.narrow_copy(10, 0, 3))  # dim > input.dim()
+        self.assertRaises(RuntimeError, lambda: input.narrow_copy(0, shape[0] + 1, 3))  # start > size of dim
+        self.assertRaises(RuntimeError, lambda: input.narrow_copy(0, 2, shape[0]))  # start + length > size of dim
+
+        with_dense, _, _ = self._gen_sparse(2, 7, shape)
+        for narrow_args in self._all_narrow_combs(shape):
+            self._test_narrow(with_dense, narrow_args)
+
+        self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3))  # dim > sparseDim + denseDim
+
     def _test_log1p_tensor(self, input, dense_tensor):
         expected_output = torch.tensor(dense_tensor).log1p_()
         self.assertEqual(expected_output, input.log1p().to_dense())
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index f5c7d41c199e0b..832de8d76db4b0 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -1453,6 +1453,17 @@ def callable(a, b) -> number
            [ 8,  9]])
 """)
 
+add_docstr_all('narrow_copy',
+               r"""
+narrow_copy(dimension, start, length) -> Tensor
+
+Same as :meth:`Tensor.narrow` except this returns a copy rather
+than shared storage. This is primarily for sparse tensors, which
+do not have a shared-storage narrow method. Calling ``narrow_copy``
+with ``dimension > self._sparseDims()`` will return a copy with the
+relevant dense dimension narrowed, and ``self.shape`` updated accordingly.
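+
+A small, illustrative example (indices and values are made up, and the
+printed output is omitted)::
+
+    >>> i = torch.tensor([[0, 1, 1], [2, 0, 2]])
+    >>> v = torch.tensor([3., 4., 5.])
+    >>> s = torch.sparse_coo_tensor(i, v, (2, 3))
+    >>> s.narrow_copy(1, 0, 2).to_dense()  # equals s.to_dense().narrow(1, 0, 2)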
+""")
+
 add_docstr_all('ndimension',
                r"""
 ndimension() -> int

From d9c27f4d8d2e71a5dd3b3c3f82790d14a296124f Mon Sep 17 00:00:00 2001
From: Dong Shi
Date: Wed, 26 Sep 2018 12:34:13 -0700
Subject: [PATCH 03/82] T33898723: Simple put operators for caffe2 stats
 (#12057)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12057

Add simple put operators for various types of stats.

Reviewed By: mlappelbaum

Differential Revision: D9925268

fbshipit-source-id: cec02b0027d2d0ef3d35741be4b02c429d492810
---
 aten/src/ATen/core/Half-inl.h                 |  27 +++++
 caffe2/operators/stats_put_ops.cc             |  92 ++++++++++++++
 caffe2/operators/stats_put_ops.h              |  53 +++++++++
 .../operator_test/stats_put_ops_test.py       | 102 ++++++++++++++++++
 4 files changed, 274 insertions(+)
 create mode 100644 caffe2/operators/stats_put_ops.cc
 create mode 100644 caffe2/operators/stats_put_ops.h
 create mode 100644 caffe2/python/operator_test/stats_put_ops_test.py

diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h
index 75ff2a2fe6937f..f964649e19f172 100644
--- a/aten/src/ATen/core/Half-inl.h
+++ b/aten/src/ATen/core/Half-inl.h
@@ -190,6 +190,33 @@ inline AT_HOST_DEVICE Half operator/(int a, Half b) {
   return static_cast<Half>(a) / b;
 }
 
+//// Arithmetic with longs
+inline AT_HOST_DEVICE Half operator+(Half a, long b) {
+  return a + static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator-(Half a, long b) {
+  return a - static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator*(Half a, long b) {
+  return a * static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator/(Half a, long b) {
+  return a / static_cast<Half>(b);
+}
+
+inline AT_HOST_DEVICE Half operator+(long a, Half b) {
+  return static_cast<Half>(a) + b;
+}
+inline AT_HOST_DEVICE Half operator-(long a, Half b) {
+  return static_cast<Half>(a) - b;
+}
+inline AT_HOST_DEVICE Half operator*(long a, Half b) {
+  return static_cast<Half>(a) * b;
+}
+inline AT_HOST_DEVICE Half operator/(long a, Half b) {
+  return static_cast<Half>(a) / b;
+}
+
 /// NOTE: we do not define comparisons directly and instead rely on the implicit
 /// conversion from at::Half to float.
 
diff --git a/caffe2/operators/stats_put_ops.cc b/caffe2/operators/stats_put_ops.cc
new file mode 100644
index 00000000000000..40c6b8cc60d085
--- /dev/null
+++ b/caffe2/operators/stats_put_ops.cc
@@ -0,0 +1,92 @@
+#include "caffe2/operators/stats_put_ops.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/stats.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+#define REGISTER_TEMPLATED_STAT_PUT_OP(OP_NAME, STAT_NAME, STAT_MACRO) \
+  struct STAT_NAME {                                                   \
+    CAFFE_STAT_CTOR(STAT_NAME);                                        \
+    STAT_MACRO(stat_value);                                            \
+  };                                                                   \
+  REGISTER_CPU_OPERATOR(OP_NAME, TemplatePutOp<STAT_NAME>);
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+    AveragePut,
+    AveragePutStat,
+    CAFFE_AVG_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(AveragePut)
+    .NumInputs(1)
+    .NumOutputs(0)
+    .Arg(
+        "name",
+        "(*str*): name of the stat. If not present, then uses name of input blob")
+    .Arg(
+        "magnitude_expand",
+        "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+    .SetDoc(R"DOC(
+      Consumes a value and pushes it to the global stat registry as an average.
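+
+      A minimal Python usage sketch (adapted from the unit test added in
+      this patch; the blob and stat names are illustrative):
+
+          workspace.FeedBlob("value", np.array([1.5], dtype=np.float))
+          workspace.RunOperatorOnce(core.CreateOperator(
+              "AveragePut",
+              "value",
+              [],
+              stat_name="my_stat",
+              magnitude_expand=1000))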
+
+      Github Links:
+      - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+  )DOC")
+    .Input(
+        0,
+        "value",
+        "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+    IncrementPut,
+    IncrementPutStat,
+    CAFFE_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(IncrementPut)
+    .NumInputs(1)
+    .NumOutputs(0)
+    .Arg(
+        "name",
+        "(*str*): name of the stat. If not present, then uses name of input blob")
+    .Arg(
+        "magnitude_expand",
+        "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+    .SetDoc(R"DOC(
+      Consumes a value and pushes it to the global stat registry as a sum.
+
+      Github Links:
+      - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+  )DOC")
+    .Input(
+        0,
+        "value",
+        "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+    StdDevPut,
+    StdDevPutStat,
+    CAFFE_STDDEV_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(StdDevPut)
+    .NumInputs(1)
+    .NumOutputs(0)
+    .Arg(
+        "name",
+        "(*str*): name of the stat. If not present, then uses name of input blob")
+    .Arg(
+        "magnitude_expand",
+        "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+    .SetDoc(R"DOC(
+      Consumes a value and pushes it to the global stat registry as a standard deviation.
+
+      Github Links:
+      - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+  )DOC")
+    .Input(
+        0,
+        "value",
+        "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+} // namespace caffe2
diff --git a/caffe2/operators/stats_put_ops.h b/caffe2/operators/stats_put_ops.h
new file mode 100644
index 00000000000000..659df219809d34
--- /dev/null
+++ b/caffe2/operators/stats_put_ops.h
@@ -0,0 +1,53 @@
+#include <limits>
+#include "caffe2/core/operator.h"
+#include "caffe2/core/stats.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/types.h"
+
+namespace caffe2 {
+
+template <class T>
+struct TemplatePutOp : public Operator<CPUContext> {
+  TemplatePutOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws),
+        given_name_(GetSingleArgument<std::string>(
+            "stat_name",
+            operator_def.input().Get(0))),
+        magnitude_expand_(GetSingleArgument<int64_t>("magnitude_expand", 1)),
+        stat_(given_name_) {}
+
+  bool RunOnDevice() override {
+    return DispatchHelper<TensorTypes<
+        int, float, uint8_t, int8_t, uint16_t, int16_t, int64_t, at::Half, double>>::
+        call(this, Input(0));
+  }
+
+  template <typename V>
+  bool DoRunWithType() {
+    auto input = *Input(0).template data<V>();
+
+    CAFFE_ENFORCE(
+        static_cast<int64_t>(input + 1) <
+            std::numeric_limits<int64_t>::max() / magnitude_expand_,
+        "Input value is too large for the given magnitude expansion!");
+
+    int64_t int_value = input * magnitude_expand_;
+
+    CAFFE_EVENT(stat_, stat_value, int_value);
+
+    return true;
+  }
+
+ private:
+  const std::string given_name_;
+  const long magnitude_expand_;
+  T stat_;
+};
+} // namespace caffe2
diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py
new file mode 100644
index 00000000000000..d3757c3b396e50
--- /dev/null
+++ b/caffe2/python/operator_test/stats_put_ops_test.py
@@ -0,0 +1,102 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core, workspace
+from caffe2.python.test_util import TestCase
+import numpy as np
+
+
+class TestPutOps(TestCase):
+
+    def test_avg_put_ops(self):
+        put_value = 15.1111
+        
magnitude_expand = 10000 + stat_name = "a1".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + count_postfix = "/stat_value/count".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "AveragePut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + put_value * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) + + def test_increment_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "i1".encode('ascii') + member_postfix = "/stat_value".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "IncrementPut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + member_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + member_postfix], + put_value * magnitude_expand) + + def test_stddev_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "s1".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + count_postfix = "/stat_value/count".encode("ascii") + sumoffset_postfix = "/stat_value/sumoffset".encode("ascii") + sumsqoffset_postfix = "/stat_value/sumsqoffset".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "StdDevPut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertIn(stat_name + sumoffset_postfix, stat_dict) + self.assertIn(stat_name + sumsqoffset_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + put_value * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) From 6ff568df4dc536e4dfbb090caa9db8b3a71f2bf1 Mon Sep 17 00:00:00 2001 From: Vlad Belous Date: Wed, 26 Sep 2018 13:22:17 -0700 Subject: [PATCH 04/82] Add full namespace resolution in CAFFE_DURATION (#12065) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12065 Had compilation issues using CAFFE_DURATION in some contexts, specifically due to namespace resolution. Since this is a macro, it should fully qualify. 
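To make the failure mode concrete, here is a minimal standalone sketch (the `lib`/`detail` names and macros are made up for illustration): names in a macro body are looked up at the expansion site, not where the macro is defined, so an unqualified `detail::ScopeGuard` breaks as soon as the macro is expanded outside `namespace caffe2`.

```cpp
#include <iostream>

namespace lib {
namespace detail {
inline void log(const char* msg) { std::cout << msg << "\n"; }
} // namespace detail

// Unqualified: `detail` is resolved where the macro is *used*, so this
// fails to compile when expanded outside namespace lib.
#define LIB_LOG_UNQUALIFIED(msg) detail::log(msg)

// Fully qualified: the expansion works from any namespace.
#define LIB_LOG_QUALIFIED(msg) ::lib::detail::log(msg)
} // namespace lib

int main() {
  LIB_LOG_QUALIFIED("ok");         // fine from any scope
  // LIB_LOG_UNQUALIFIED("boom");  // error: 'detail' has not been declared
  return 0;
}
```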
Reviewed By: heslami Differential Revision: D10036132 fbshipit-source-id: b8d55dfe5e991ca702ce5b7483f0ffc699882c85 --- caffe2/core/stats.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 86c6827e3039a1..f037ca6e175606 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -350,8 +350,8 @@ _ScopeGuard ScopeGuard(T f) { ##__VA_ARGS__); \ } -#define CAFFE_DURATION(stats, field, ...) \ - if (auto g = detail::ScopeGuard([&](int64_t nanos) { \ - CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ +#define CAFFE_DURATION(stats, field, ...) \ + if (auto g = ::caffe2::detail::ScopeGuard([&](int64_t nanos) { \ + CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ })) } // namespace caffe2 From 1b45f6839789f3849fbde01c4fd88e5034172c44 Mon Sep 17 00:00:00 2001 From: Syed Tousif Ahmed Date: Wed, 26 Sep 2018 15:19:10 -0700 Subject: [PATCH 05/82] Use atomicAdd from cuda_fp16 header when building with CUDA 10 (#12108) Summary: An efficient atomicAdd for halfs has been added in `cuda_fp16.h` in CUDA 10: ```__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val);``` Through this change, PyTorch will be able to utilize efficient atomicAdd when building with CUDA 10. Pull Request resolved: https://github.com/pytorch/pytorch/pull/12108 Differential Revision: D10053385 Pulled By: soumith fbshipit-source-id: 946c90691a8f6bdcf6d6e367a507ac3c9970b750 --- aten/src/THC/THCAtomics.cuh | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 756fa0f905ac13..8752b0df458cfd 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -96,19 +96,24 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { } static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - unsigned int * address_as_ui = - (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; + #if ((CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + do { + assumed = old; + at::Half hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = THCNumerics::add(hsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + #else + atomicAdd(reinterpret_cast<__half*>(address), val); + #endif - do { - assumed = old; - at::Half hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } while (assumed != old); } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) From 75b1ae1acde83c8c05d4011a36ad82b73cc909ff Mon Sep 17 00:00:00 2001 From: Joel Marcey Date: Wed, 26 Sep 2018 16:09:59 -0700 Subject: [PATCH 06/82] Update issue templates Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12114 Reviewed By: soumith Differential Revision: D10060349 Pulled By: JoelMarcey fbshipit-source-id: ed88bf95f78742b089adb043e88613a5db006a10 --- .github/ISSUE_TEMPLATE/bug-report.md | 49 +++++++++++++++++++ .github/ISSUE_TEMPLATE/documentation.md | 9 ++++ .github/ISSUE_TEMPLATE/feature-request.md | 24 +++++++++ .../ISSUE_TEMPLATE/questions-help-support.md | 13 +++++ 4 files changed, 95 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug-report.md create mode 100644 .github/ISSUE_TEMPLATE/documentation.md create mode 100644 .github/ISSUE_TEMPLATE/feature-request.md create mode 100644 .github/ISSUE_TEMPLATE/questions-help-support.md diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 00000000000000..712143336a1af7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,49 @@ +--- +name: "\U0001F41B Bug Report" +about: Submit a bug report to help us improve PyTorch + +--- + +## ๐Ÿ› Bug + + + +## To Reproduce + +Steps to reproduce the behavior: + +1. +1. +1. + + + +## Expected behavior + + + +## Environment + +Please copy and paste the output from our +[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) +(or fill out the checklist below manually). + +You can get the script and run it with: +``` +wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py +# For security purposes, please check the contents of collect_env.py before running it. +python collect_env.py +``` + + - PyTorch Version (e.g., 1.0): + - OS (e.g., Linux): + - How you installed PyTorch (`conda`, `pip`, source): + - Build command you used (if compiling from source): + - Python version: + - CUDA/cuDNN version: + - GPU models and configuration: + - Any other relevant information: + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md new file mode 100644 index 00000000000000..a699c2e4548f8a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -0,0 +1,9 @@ +--- +name: "\U0001F4DA Documentation" +about: Report an issue related to https://pytorch.org/docs + +--- + +## ๐Ÿ“š Documentation + + diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 00000000000000..e1d2bc306eae8c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,24 @@ +--- +name: "\U0001F680Feature Request" +about: Submit a proposal/request for a new PyTorch feature + +--- + +## ๐Ÿš€ Feature + + +## Motivation + + + +## Pitch + + + +## Alternatives + + + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/questions-help-support.md b/.github/ISSUE_TEMPLATE/questions-help-support.md new file mode 100644 index 00000000000000..77bfb55b9a468a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/questions-help-support.md @@ -0,0 +1,13 @@ +--- +name: "โ“Questions/Help/Support" +about: Do you need support? We have resources. 
+ +--- + +## โ“ Questions and Help + +### Please note that this issue tracker is not a help form and this issue will be closed. + +We have a set of [listed resources available on the website](https://pytorch.org/resources). Our primary means of support is our discussion forum: + +- [Discussion Forum](https://discuss.pytorch.org/) From 478803a75fb2615e46293eec168dd941c65b3223 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Wed, 26 Sep 2018 16:55:07 -0700 Subject: [PATCH 07/82] Introduce type variables to implement generic list operators (#12040) Summary: We generate specialized list operations for int, float, and Tensor lists so that small lists of integers like the arguments to conv do not involve tons of boxing code. This PR adds a fallback GenericList for List types that contain any other type. It does so by adding type variables to `jit::Type`, and machinery for matching/replacing the type variables during `tryMatchSchema` and operator lookup. It also modifies the builtin list ops to include a fallback that works on a GenericList object that simply holds IValues. This is distinguished from IValue's tuple type so that conversion to/from Python still happens losslessly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/12040 Differential Revision: D10037098 Pulled By: zdevito fbshipit-source-id: 0c5f2864d12e7d33554bf34cc29e5fb700dde150 --- aten/src/ATen/core/ivalue.cpp | 16 +++-- aten/src/ATen/core/ivalue.h | 55 ++++++++++++++-- test/test_jit.py | 23 ++++--- torch/csrc/jit/argument_spec.h | 1 - torch/csrc/jit/export.cpp | 2 + torch/csrc/jit/import.cpp | 4 +- torch/csrc/jit/operator.cpp | 19 +++++- torch/csrc/jit/pybind_utils.h | 9 +++ torch/csrc/jit/python_ir.cpp | 2 + torch/csrc/jit/register_prim_ops.cpp | 52 ++++++++------- torch/csrc/jit/script/compiler.cpp | 96 ++++++++++++++-------------- torch/csrc/jit/script/compiler.h | 8 ++- torch/csrc/jit/script/init.cpp | 10 +-- torch/csrc/jit/script/module.cpp | 8 +-- torch/csrc/jit/type.cpp | 69 ++++++++++++++++++++ torch/csrc/jit/type.h | 64 ++++++++++++++++++- 16 files changed, 328 insertions(+), 110 deletions(-) diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 8dfb1e8ebb75b6..8077f935ae8242 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,10 +1,18 @@ #include #include -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ - _(TensorList) _(Blob) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) namespace torch { namespace jit { diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 513845d4c12af0..5613dc42357fa9 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -35,10 +35,11 @@ struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { // non-mutable list template -struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { +struct C10_EXPORT ConstantList : c10::intrusive_ptr_target { private: const std::vector elements_; public: + typedef Elem ElemType; ConstantList(std::vector elements_) : elements_(std::move(elements_)) {} static c10::intrusive_ptr> create(std::vector elements_) { @@ -53,10 +54,16 @@ struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { }; struct IValue; -using Tuple = ConstantList; +struct C10_EXPORT Tuple : public ConstantList { + using 
ConstantList::ConstantList; + static c10::intrusive_ptr create(std::vector elements_) { + return c10::make_intrusive(std::move(elements_)); + } +}; using IntList = ConstantList; using TensorList = ConstantList; using DoubleList = ConstantList; +using GenericList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold // all value types. @@ -65,10 +72,18 @@ using DoubleList = ConstantList; // to mark whether that type is a subtype of c10::intrusive_ptr_target and needs // retain/release calls. -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ - _(TensorList) _(Blob) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) struct CAFFE2_API IValue final { IValue() @@ -207,6 +222,7 @@ struct CAFFE2_API IValue final { const std::vector& toIntListRef() const; const std::vector& toDoubleListRef() const; const std::vector& toTensorListRef() const; + const std::vector& toGenericListRef() const; // ConstantString IValue(c10::intrusive_ptr v); @@ -247,6 +263,19 @@ struct CAFFE2_API IValue final { return toIntrusivePtr(); } + //GenericList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isGenericList() const { return Tag::GenericList == tag; } + c10::intrusive_ptr toGenericList() && { + AT_ASSERT(isGenericList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toGenericList() const & { + AT_ASSERT(isGenericList()); + return toIntrusivePtr(); + } + // None bool isNone() { return Tag::None == tag; @@ -362,12 +391,14 @@ DEFINE_TO(int64_t, toInt) DEFINE_TO(c10::intrusive_ptr, toDoubleList) DEFINE_TO(c10::intrusive_ptr, toIntList) DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toGenericList) DEFINE_TO(c10::intrusive_ptr, toString) DEFINE_TO(at::Scalar, toScalar) DEFINE_TO(bool, toInt) DEFINE_TO(std::vector, toIntListRef) DEFINE_TO(std::vector, toDoubleListRef) DEFINE_TO(std::vector, toTensorListRef) +DEFINE_TO(std::vector, toGenericListRef) #undef DEFINE_TO @@ -433,6 +464,14 @@ inline IValue::IValue(c10::intrusive_ptr v) inline IValue::IValue(std::vector v) : IValue(TensorList::create(std::move(v))) {} +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(GenericList::create(std::move(v))) {} + + inline const std::vector& IValue::toIntListRef() const { return toIntList()->elements(); } @@ -445,5 +484,9 @@ inline const std::vector& IValue::toTensorListRef() const { return toTensorList()->elements(); } +inline const std::vector& IValue::toGenericListRef() const { + return toGenericList()->elements(); +} + }} diff --git a/test/test_jit.py b/test/test_jit.py index a448362b470bbf..84313a07dff364 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2645,18 +2645,23 @@ def stuff3(x): return torch.ones(x), x self.checkScript(stuff3, ([3, 2],)) - def test_nested_list_error(self): - with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): - @torch.jit.script - def foo(x): - # type: (Tuple[List[List[int]]]) -> int - return 4 + def test_nested_list(self): + def foo(z): + # type: (Tuple[int, List[List[int]]]) -> int + x, y = z + return y[0][1] + self.checkScript(foo, ((1, [[1, 2], [3, 4]]),)) + + def test_nested_list_construct(self): + def foo(): + return [[4]] + [[4, 5]] + 
self.checkScript(foo, ()) - def test_nested_list_construct_error(self): - with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + def test_generic_list_errors(self): + with self.assertRaisesRegex(RuntimeError, "previously matched to type"): @torch.jit.script def foo(x): - return [[4]] + return [[x]] + [[1]] def test_script_cu(self): cu = torch.jit.CompilationUnit(''' diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index 10b0cad6749128..2d1b9f7b147abb 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -61,7 +61,6 @@ static_assert(sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type), struct ArgumentSpec { ArgumentSpec(bool with_grad, at::ArrayRef inputs, size_t num_flat_inputs) { hash_code = num_flat_inputs; - args.resize(num_flat_inputs); size_t offset = 0; for (size_t i = 0; i < inputs.size(); ++i) { diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 437d0f6c779972..c993aa05eb9a04 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -564,6 +564,8 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("GeneratorType"); } else if (kind == TypeKind::StringType) { type_proto->set_denotation("StringType"); + } else if (kind == TypeKind::VarType) { + type_proto->set_denotation("TypeVar:" + type->expect()->name()); } else { throw std::runtime_error("unexpected type kind"); } diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index b2fa6eba2f748a..c03e3d80e1e06f 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -260,8 +260,10 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return NoneType::get(); } else if (kind == "GeneratorType") { return GeneratorType::get(); - }else if (kind == "StringType") { + } else if (kind == "StringType") { return StringType::get(); + } else if (kind.find("TypeVar:") == 0) { + return VarType::create(kind.substr(strlen("TypeVar:"))); } else { throw std::runtime_error("unexpected string for type kind"); } diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 75e5833535bcfc..6a4a699c24f9ad 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -62,8 +62,14 @@ struct SchemaParser { auto tok = L.expect(TK_IDENT); auto text = tok.text(); auto it = type_map.find(text); - if(it == type_map.end()) + if(it == type_map.end()) { + if(text.size() > 0 && islower(text[0])) { + // lower case identifiers that are not otherwise valid types + // are treated as type variables + return VarType::create(text); + } throw ErrorReport(tok.range) << "unknown type specifier"; + } return it->second; } void parseArgumentType(std::vector& arguments) { @@ -358,9 +364,16 @@ bool Operator::matches(const Node* node) const { if(actuals.size() < formals.size()) return false; + + TypeEnv type_env; for(size_t i = 0; i < formals.size(); ++i) { - // mismatched input type - if (!actuals[i]->type()->isSubtypeOf(formals[i].type)) { + try { + TypePtr formal = matchTypeVariables(formals[i].type, actuals[i]->type(), type_env); + // mismatched input type + if (!actuals[i]->type()->isSubtypeOf(formal)) { + return false; + } + } catch(TypeMatchError& err) { return false; } } diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index d1420574bf6bcb..c78b268cb2da75 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -142,6 +142,7 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { } case TypeKind::NumberType: 
case TypeKind::GeneratorType: + case TypeKind::VarType: break; } AT_ERROR("Missing cases in toIValue for type: ", type->str(), "! File a bug report."); @@ -199,6 +200,14 @@ inline py::object toPyObject(IValue&& ivalue) { return py::cast(ivalue.toDoubleListRef()); } else if (ivalue.isTensorList()) { return py::cast(ivalue.toTensorListRef()); + } else if (ivalue.isGenericList()) { + auto list = ivalue.toGenericList(); + const auto & elements = list->elements(); + py::list t { elements.size() }; + for (size_t i = 0; i < elements.size(); ++i) { + t[i] = toPyObject(IValue{elements[i]}); + } + return t; } else if (ivalue.isTuple()) { auto tuple = ivalue.toTuple(); const auto & elements = tuple->elements(); diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 5aa053f626faa1..0db6f9a394459c 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -455,6 +455,8 @@ void initPythonIRBindings(PyObject * module_) { return "StringType"; case TypeKind::GeneratorType: return "GeneratorType"; + case TypeKind::VarType: + return "VarType"; } // not reachable, but some compilers complain AT_ERROR("Unknown Type Kind"); diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 71168cd3ee3d4d..574d9ca1446396 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -399,9 +399,17 @@ RegisterOperators reg({ return 0; }; } else { - std::stringstream ss; - ss << "unsupported list type: " << *lt->getElementType(); - throw std::runtime_error(ss.str()); + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i])); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } }), }); @@ -506,11 +514,7 @@ Operation listEq(Node* node) { T a; T b; pop(stack, a, b); - if (a->elements() == b->elements()) { - push(stack, 1); - } else { - push(stack, 0); - } + push(stack, a->elements() == b->elements() ? 
1 : 0); return 0; }; } @@ -604,31 +608,25 @@ Operation listSlice(Node* node) { } RegisterOperators reg2({ - Operator("aten::select(int[] a, int b) -> int", listSelect>), - Operator("aten::select(float[] a, int b) -> float", listSelect>), - Operator("aten::select(Tensor[] a, int b) -> Tensor", listSelect>), - Operator("aten::len(int[] a) -> int", listLen>), - Operator("aten::len(float[] a) -> int", listLen>), - Operator("aten::len(Tensor[] a) -> int", listLen>), +#define CREATE_LIST_OPS(decl_type, c_type) \ + Operator("aten::select(" decl_type "[] a, int b) -> " decl_type, listSelect>), \ + Operator("aten::len(" decl_type "[] a) -> int", listLen>), \ + Operator("aten::add(" decl_type "[] a, " decl_type "[] b) -> " decl_type "[]", listAdd, c_type::ElemType>), \ + Operator( \ + "aten::slice(" decl_type "[] l, int start, int end=9223372036854775807, int step=1) -> " decl_type "[]", \ + listSlice, c_type::ElemType>), + + + CREATE_LIST_OPS("int", IntList) + CREATE_LIST_OPS("float", DoubleList) + CREATE_LIST_OPS("Tensor", TensorList) + CREATE_LIST_OPS("t", GenericList) Operator("aten::eq(int[] a, int[] b) -> int", listEq>), Operator("aten::eq(float[] a, float[] b) -> int", listEq>), Operator("aten::eq(Tensor[] a, Tensor[] b) -> int", listEq>), - Operator("aten::add(int[] a, int[] b) -> int[]", listAdd, int64_t>), - Operator("aten::add(float[] a, float[] b) -> float[]", listAdd, double>), - Operator("aten::add(Tensor[] a, Tensor[] b) -> Tensor[]", listAdd, at::Tensor>), - - Operator( - "aten::slice(int[] l, int start, int end=9223372036854775807, int step=1) -> int[]", - listSlice, int64_t>), - Operator( - "aten::slice(float[] l, int start, int end=9223372036854775807, int step=1) -> float[]", - listSlice, double>), - Operator( - "aten::slice(Tensor[] l, int start, int end=9223372036854775807, int step=1) -> Tensor[]", - listSlice, at::Tensor>), DEFINE_BINARY_OP(aten::add, a + b) DEFINE_BINARY_OP(aten::sub, a - b) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index b66b96dd5eb6fb..384cc167735e55 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -449,7 +449,8 @@ Value* tryMatchArgument( const SourceRange& loc, const NamedValue& named_value, std::function err, - bool convert_tensors_to_nums) { + bool convert_tensors_to_nums, + TypeEnv & type_env) { Value* value = named_value.value(graph); // some functions that take lists of integers for fixed size arrays @@ -460,35 +461,44 @@ Value* tryMatchArgument( value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } + TypePtr concrete_type; + try { + concrete_type = matchTypeVariables(arg.type, value->type(), type_env); + } catch(TypeMatchError& e) { + err() << "could not match type " << value->type()->str() << " to " + << arg.type->str() << " in argument '" << arg.name << "': " << e.what() << "\n" + << named_value.locOr(loc); + return nullptr; + } + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types - if (convertibleToList(value->type(), arg.type) && + if (convertibleToList(value->type(), concrete_type) && value->type()->kind() == TypeKind::TupleType) { auto unpacked = createTupleUnpack(value); - auto elem_type = arg.type->expect()->getElementType(); + auto elem_type = concrete_type->expect()->getElementType(); value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } if (value->node()->kind() == prim::None){ - if (arg.type->isSubtypeOf(NumberType::get())) + if 
(concrete_type->isSubtypeOf(NumberType::get())) value = graph.insertConstant(at::Scalar(NAN), loc); - else if (arg.type->isSubtypeOf(GeneratorType::get())) { - value = graph.insertNode(graph.createNoneGenerator()) - ->output()->setType(GeneratorType::get()); + else if (concrete_type->isSubtypeOf(GeneratorType::get())) { + value = graph.insertNode(graph.createNoneGenerator())->output(); } else value = graph.insertNode(graph.createUndefined())->output(); } //implicit conversion of tensors to scalars - if(convert_tensors_to_nums && arg.type->isSubtypeOf(NumberType::get()) + if(convert_tensors_to_nums && concrete_type->isSubtypeOf(NumberType::get()) && value->type()->isSubtypeOf(DynamicType::get())) { - auto n = graph.createImplicitTensorToNum(arg.type, value); + auto n = graph.createImplicitTensorToNum(concrete_type, value); value = graph.insertNode(n) ->setSourceLocation(std::make_shared(loc)) ->output(); } - if(!value->type()->isSubtypeOf(arg.type)) { - err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " + if(!value->type()->isSubtypeOf(concrete_type)) { + err() << "expected a value of type " << concrete_type->str() << " for argument '" << arg.name << "' but found " << value->type()->str() << "\n" << named_value.locOr(loc); return nullptr; @@ -510,11 +520,12 @@ Value* tryCreateList( const SourceRange& loc, at::ArrayRef varargs, std::function err, - bool convert_tensor_to_num) { - Argument elem_arg("", elem_type); + bool convert_tensor_to_num, + TypeEnv & type_env) { + Argument elem_arg("", elem_type); std::vector list_ctor; for(const auto& a : varargs) { - Value* av = tryMatchArgument(elem_arg, graph, loc, a, err, convert_tensor_to_num); + Value* av = tryMatchArgument(elem_arg, graph, loc, a, err, convert_tensor_to_num, type_env); if(!av) return nullptr; list_ctor.push_back(av); @@ -537,7 +548,7 @@ static Value* materializeConstant(T val, Graph& graph, return new_constant; } -at::optional> tryMatchSchema( +at::optional tryMatchSchema( const FunctionSchema& schema, const SourceRange& loc, Graph& graph, @@ -550,6 +561,7 @@ at::optional> tryMatchSchema( return failure_messages; }; + TypeEnv type_env; std::vector positional_inputs; std::vector used_kwarg(kwargs.size(), false); @@ -564,16 +576,18 @@ at::optional> tryMatchSchema( // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) if (arg.type->kind() == TypeKind::ListType && // the formal must be a list !arg.N && // it must not be a broadcasting list like int[3], otherwise a single int is a valid input - (schema_i + 1 == schema.arguments.size() || schema.arguments[schema_i + 1].kwarg_only) && // must be the last position argument - !convertibleToList(args[schema_i].value(graph)->type(), arg.type)) { // and the actual should not be a list already - auto elem_type = arg.type->expect()->getElementType(); - Value* list = tryCreateList(elem_type, graph, loc, args.slice(schema_i), - err, convert_tensors_to_nums); - if(!list) - return at::nullopt; - used_args = args.size(); - positional_inputs.push_back(list); - continue; + (schema_i + 1 == schema.arguments.size() || schema.arguments[schema_i + 1].kwarg_only)) { // must be the last position argument + auto actual_type = args[schema_i].value(graph)->type(); + if (actual_type->kind() != TypeKind::ListType && !convertibleToList(actual_type, arg.type)) { // and the actual should not be a list already + auto elem_type = arg.type->expect()->getElementType(); + Value* list = tryCreateList(elem_type, graph, loc, args.slice(schema_i), + err, 
convert_tensors_to_nums, type_env); + if(!list) + return at::nullopt; + used_args = args.size(); + positional_inputs.push_back(list); + continue; + } } v = args[schema_i]; @@ -592,7 +606,7 @@ at::optional> tryMatchSchema( err() << "argument " << schema.arguments[schema_i].name << " not provided.\n" << loc; return at::nullopt; } - Value * positional = tryMatchArgument(arg, graph, loc, *v, err, convert_tensors_to_nums); + Value * positional = tryMatchArgument(arg, graph, loc, *v, err, convert_tensors_to_nums, type_env); if(!positional) return at::nullopt; positional_inputs.push_back(positional); @@ -616,7 +630,10 @@ at::optional> tryMatchSchema( return at::nullopt; } } - return positional_inputs; + auto return_types = fmap(schema.returns, [&](const Argument& r) { + return evalTypeVariables(r.type, type_env); + }); + return MatchedSchema {std::move(positional_inputs), std::move(return_types) }; } @@ -630,17 +647,17 @@ static Value* tryEmitBuiltin( at::ArrayRef attributes, bool convert_tensors_to_nums) { - auto matched_inputs = tryMatchSchema(op->schema(), loc, graph, inputs, attributes, + auto matched_schema = tryMatchSchema(op->schema(), loc, graph, inputs, attributes, failure_messages, convert_tensors_to_nums); - if(!matched_inputs) + if(!matched_schema) return nullptr; // we successfully matched this schema, construct the node - auto n = graph.insertNode(graph.create(name, *matched_inputs, 0)) + auto n = graph.insertNode(graph.create(name, matched_schema->inputs, 0)) ->setSourceLocation(std::make_shared(loc)); - for(auto & ret : op->schema().returns) { - n->addOutput()->setType(ret.type); + for(auto & ret : matched_schema->return_types) { + n->addOutput()->setType(ret); } // assert that we did indeed create an op that has implementation @@ -728,19 +745,6 @@ inline bool isSupportedListElementType(TypePtr type) { type->isSubtypeOf(NumberType::get()); } -// guard for List types we do not currently have operations for -inline void ensureLegalType(const SourceRange& range, TypePtr ptr) { - if(TupleTypePtr tt = ptr->cast()) { - for(auto elem : tt->elements()) { - ensureLegalType(range, elem); - } - } else if(ListTypePtr lt = ptr->cast()) { - if(!isSupportedListElementType(lt->getElementType())) { - throw ErrorReport(range) << "Lists can only contain numbers or Tensors, but found " << lt->getElementType()->str(); - } - } -} - struct to_ir { to_ir( Def def, @@ -791,7 +795,6 @@ struct to_ir { // Record the type for the schema and set the Type on the Value* arguments.push_back(schema.arguments.at(arg_annotation_idx++)); new_input->setType(arguments.back().type); - ensureLegalType((*it).ident().range(), arguments.back().type); } // body auto stmts = def.statements(); @@ -1577,7 +1580,6 @@ struct to_ir { } Value* result = graph->insertNode(graph->createList(elem_type, values)) ->output(); - ensureLegalType(tree->range(), result->type()); return result; } break; case TK_TUPLE_LITERAL: { diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index deef6a5c2ca8f5..745137d1a9ad05 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -164,7 +164,13 @@ TORCH_API void ensureTensors(const SourceRange& range, at::ArrayRef valu // if it returns nullopt, then failure_messages contains a good error report // set convert_tensor_to_num to true if ImplicitTensorToNums should be inserted to // match the schema -TORCH_API at::optional> tryMatchSchema( + +struct MatchedSchema { + std::vector inputs; + std::vector return_types; +}; + +TORCH_API at::optional 
tryMatchSchema( const FunctionSchema& schema, const SourceRange& loc, Graph& graph, diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index f0dfda81cc0926..abc0560bbc1464 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -114,9 +114,9 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { auto schema = getSchema(inputs.size(), n_binders); std::stringstream failure_messages; - at::optional> all_inputs = + at::optional matched_schema = tryMatchSchema(schema, loc, *m.graph(), inputs_, attributes, failure_messages, /*conv_tensor_to_num*/true); - if (!all_inputs) + if (!matched_schema) throw ErrorReport(loc) << failure_messages.str(); // Release the function object so we can wrap it in a PythonOp @@ -125,12 +125,12 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { Node* new_node = m.graph()->insertNode(m.graph()->createPythonOp( THPObjectPtr(func.release().ptr()), cconv, {})); new_node->setSourceLocation(std::make_shared(loc)); - for(auto &i : *all_inputs) + for(auto &i : matched_schema->inputs) new_node->addInput(i); std::vector outputs; - for(auto & ret_arg : schema.returns) { - outputs.push_back(new_node->addOutput()->setType(ret_arg.type)); + for(auto & ret_arg : matched_schema->return_types) { + outputs.push_back(new_node->addOutput()->setType(ret_arg)); } return std::make_shared(packOutputs(*m.graph(), outputs)); } diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index b1f6a6e220bbc9..4180c93f48c837 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -48,18 +48,18 @@ std::vector Method::emit_call_to(SourceRange loc, Method & callee, Array auto fn = callee.graph(); std::stringstream failure_messages; - auto all_inputs = tryMatchSchema( + auto matched_schema = tryMatchSchema( callee.getSchema(), loc, *graph(), args, kwargs, failure_messages, /*conv_tensors_to_nums*/true); - if(!all_inputs) + if(!matched_schema) throw ErrorReport(loc) << failure_messages.str(); // parameters to callee method (which become parameters to _this_ method // if they were not already) for(at::Tensor* member : callee.member_inputs) { - all_inputs->push_back(get_or_add_parameter(member)); + matched_schema->inputs.push_back(get_or_add_parameter(member)); } - return inlineCallTo(*graph(), *callee.graph(), *all_inputs); + return inlineCallTo(*graph(), *callee.graph(), matched_schema->inputs); } void Method::ensure_defined() { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index a4ada2647af7f5..bc559d8868daae 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -55,6 +55,8 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "string"; } else if(t.kind() == TypeKind::GeneratorType) { out << "Generator"; + } else if(t.kind() == TypeKind::VarType) { + out << t.expect()->name(); } else { AT_ERROR("unknown type kind"); } @@ -170,4 +172,71 @@ at::optional unifyTypes(const TypePtr& t1, const TypePtr& t2) { return at::nullopt; } +TypePtr matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv& type_env) { + if(!formal->hasFreeVariables()) + return formal; + if(auto vt = formal->cast()) { + auto it = type_env.find(vt->name()); + if(it == type_env.end()) { + type_env[vt->name()] = actual; + return actual; + } else if(auto unified = unifyTypes(it->second, actual)) { + type_env[vt->name()] = *unified; + return *unified; + } + std::stringstream ss; + ss << "type variable '" << vt->name() <<"' previously matched to type " << + 
+      it->second->str() << " is matched to type " << actual->str();
+    throw TypeMatchError(ss.str());
+  } else if(auto lt_formal = formal->cast<ListType>()) {
+    if(auto lt_actual = actual->cast<ListType>()) {
+      return ListType::create(matchTypeVariables(lt_formal->getElementType(), lt_actual->getElementType(), type_env));
+    } else {
+      std::stringstream ss;
+      ss << "cannot match a list to " << actual->str();
+      throw TypeMatchError(ss.str());
+    }
+  } else if(auto tp_formal = formal->cast<TupleType>()) {
+    if(auto tp_actual = actual->cast<TupleType>()) {
+      if(tp_formal->elements().size() != tp_actual->elements().size()) {
+        std::stringstream ss;
+        throw TypeMatchError("cannot match tuples of mismatched size");
+      }
+      std::vector<TypePtr> elements;
+      for(size_t i = 0; i < tp_formal->elements().size(); ++i) {
+        TypePtr result = matchTypeVariables(
+            tp_formal->elements()[i],
+            tp_actual->elements()[i],
+            type_env);
+        elements.push_back(result);
+      }
+      return TupleType::create(std::move(elements));
+    } else {
+      std::stringstream ss;
+      ss << "cannot match a tuple to " << actual->str();
+      throw TypeMatchError(ss.str());
+    }
+  }
+  AT_ERROR("unhandled free variable container: ", formal->str());
+}
+
+// change return types like List[List[t]] into List[List[int]]
+TORCH_API TypePtr evalTypeVariables(TypePtr type, std::unordered_map<std::string, TypePtr>& type_env) {
+  if(!type->hasFreeVariables())
+    return type;
+
+  if(auto vt = type->cast<VarType>()) {
+    auto it = type_env.find(vt->name());
+    AT_ASSERTM(it != type_env.end(), "schema has unbound type variable '", vt->name(), "' in its return type");
+    return it->second;
+  } else if(auto lt = type->cast<ListType>()) {
+    return ListType::create(evalTypeVariables(lt->getElementType(), type_env));
+  } else if(auto tp = type->cast<TupleType>()) {
+    return TupleType::create(fmap(tp->elements(), [&](const TypePtr& typ) {
+      return evalTypeVariables(typ, type_env);
+    }));
+  }
+  return type;
+}
+
 }} // namespace torch::jit
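A minimal sketch of how the two new helpers compose (illustrative only; it
assumes the VarType and TypeEnv declarations introduced in type.h below):

    // Bind the type variable 't' by matching the formal List[t] against an
    // actual List[int], then substitute the binding back into the type.
    TypeEnv env;
    TypePtr formal = ListType::create(VarType::create("t"));
    matchTypeVariables(formal, ListType::ofInts(), env); // env["t"] = int
    TypePtr evaluated = evalTypeVariables(formal, env);  // List[int]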
diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h
index 96e9f45496a34b..b519c6f710b21f 100644
--- a/torch/csrc/jit/type.h
+++ b/torch/csrc/jit/type.h
@@ -27,6 +27,7 @@ _(IntType) \
 _(NoneType) \
 _(StringType) \
 _(GeneratorType) \
+_(VarType)
 
 enum class TypeKind {
 #define DEFINE_TYPE(T) T,
@@ -133,6 +134,9 @@ struct TORCH_API Type : std::enable_shared_from_this<Type> {
     return r;
   }
   virtual ~Type() = default;
+  virtual bool hasFreeVariables() const {
+    return false;
+  }
 };
 
 inline bool operator!=(const Type & lhs, const Type & rhs) {
@@ -400,6 +404,9 @@ struct TORCH_API ListType : public Type {
   TypePtr getElementType() const {
     return elem;
   }
+  bool hasFreeVariables() const override {
+    return has_free_variables_;
+  }
   // common cast List[Tensor]
   static ListTypePtr ofTensors();
   static ListTypePtr ofInts();
@@ -408,8 +415,11 @@ struct TORCH_API ListType : public Type {
   static const TypeKind Kind = TypeKind::ListType;
 private:
   ListType(TypePtr elem)
-  : Type(TypeKind::ListType), elem(std::move(elem)) {}
+  : Type(TypeKind::ListType)
+  , elem(std::move(elem))
+  , has_free_variables_(getElementType()->hasFreeVariables()) {}
   TypePtr elem;
+  bool has_free_variables_;
 };
 
 struct TupleType;
@@ -461,12 +471,20 @@ struct TORCH_API TupleType : public Type {
     ss << "]";
     return ss.str();
   }
+  bool hasFreeVariables() const override {
+    return has_free_variables_;
+  }
   static const TypeKind Kind = TypeKind::TupleType;
 private:
   TupleType(std::vector<TypePtr> elements_)
   : Type(TypeKind::TupleType)
-  , elements_(std::move(elements_)) {}
+  , elements_(std::move(elements_)) {
+    has_free_variables_ =
+        std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) {
+          return v->hasFreeVariables();
+        });
+  }
 
   bool compare(const Type& rhs, std::function<bool(const Type&, const Type&)> fn) const {
     if(rhs.kind() != kind())
@@ -482,6 +500,7 @@ struct TORCH_API TupleType : public Type {
     return true;
   }
   std::vector<TypePtr> elements_;
+  bool has_free_variables_;
 };
 
 struct NumberType;
@@ -631,6 +650,34 @@ struct GeneratorType : public Type {
 };
 
+// a type variable, used in FunctionSchema
+struct VarType;
+using VarTypePtr = std::shared_ptr<VarType>;
+struct VarType : public Type {
+  static constexpr bool is_singleton = false;
+  template<typename ... T>
+  static VarTypePtr create(std::string name_) {
+    return VarTypePtr(new VarType(std::move(name_)));
+  }
+  bool operator==(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return name();
+  }
+  static const TypeKind Kind = TypeKind::VarType;
+  const std::string& name() const {
+    return name_;
+  }
+  bool hasFreeVariables() const override {
+    return true;
+  }
+private:
+  VarType(std::string name_)
+  : Type(TypeKind::VarType), name_(name_) {}
+  std::string name_;
+};
+
 TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t);
 // what is the type, ignoring extra size/shape information?
 // e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...)
@@ -689,4 +736,17 @@ template<> inline TypePtr getTypePtr<std::vector<at::Tensor>>() { return ListType::
 
 TORCH_API TypePtr inferTypeFrom(const IValue& value);
 
+struct TORCH_API TypeMatchError : public std::exception {
+  TypeMatchError(std::string msg_)
+  : msg_(std::move(msg_)) {}
+  const char * what() const noexcept override {
+    return msg_.c_str();
+  }
+private:
+  std::string msg_;
+};
+using TypeEnv = std::unordered_map<std::string, TypePtr>;
+TORCH_API TypePtr matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv & type_env);
+TORCH_API TypePtr evalTypeVariables(TypePtr type, TypeEnv & type_env);
+
 }} // namespace torch::jit

From db5f8d42bbdddf680f2d54f9c2e1c7afccec56b3 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch
Date: Wed, 26 Sep 2018 16:57:22 -0700
Subject: [PATCH 08/82] Remove TIndex typedef from core/common.h (#12032)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12032

See title

Reviewed By: dinhviethoa

Differential Revision: D10023757

fbshipit-source-id: dbf0a043b2afab767f052bd4c5e8de13e0f57dcc
---
 caffe2/core/common.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/caffe2/core/common.h b/caffe2/core/common.h
index 93bbf341b5061a..2582a605adee55 100644
--- a/caffe2/core/common.h
+++ b/caffe2/core/common.h
@@ -30,10 +30,6 @@
 
 namespace caffe2 {
 
-// Data type for caffe2 Index/Size. We use size_t to be safe here as well as for
-// large matrices that are common in sparse math.
-typedef int64_t TIndex;
-
 // Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
 // forcing us to use std::map instead of unordered_map. This may affect speed
 // in some cases, but in most of the computation code we do not access map very

From 0f81039eafc3ce6b81c07e9d3c16d8d68dd14b77 Mon Sep 17 00:00:00 2001
From: Peter Goldsborough
Date: Wed, 26 Sep 2018 20:39:53 -0700
Subject: [PATCH 09/82] Better high level C++ documentation (#12079)

Summary:
I wrote some high level docs for the larger PyTorch C++ universe and the C++ frontend specifically. Happy for reviews, but let's please also land this ASAP so I can point users at something that looks more ready-baked than the C++ docs landing page (https://pytorch.org/cppdocs) does right now.
ezyang soumith CC ebetica

Pull Request resolved: https://github.com/pytorch/pytorch/pull/12079

Differential Revision: D10080785

Pulled By: goldsborough

fbshipit-source-id: 3028de41373f307468eb1e3802aa27871c93b2e3
---
 docs/cpp/source/Doxyfile         |   4 +-
 docs/cpp/source/building.rst     |   2 -
 docs/cpp/source/contributing.rst |  10 +-
 docs/cpp/source/examples.rst     |   2 -
 docs/cpp/source/frontend.rst     | 146 +++++++++++++++++++++++++++
 docs/cpp/source/index.rst        | 164 ++++++++++++++++++++++++++++---
 docs/cpp/source/installing.rst   | 132 +++++++++++++++++++++++++
 7 files changed, 437 insertions(+), 23 deletions(-)
 delete mode 100644 docs/cpp/source/building.rst
 delete mode 100644 docs/cpp/source/examples.rst
 create mode 100644 docs/cpp/source/frontend.rst
 create mode 100644 docs/cpp/source/installing.rst

diff --git a/docs/cpp/source/Doxyfile b/docs/cpp/source/Doxyfile
index 2ab4947453eaca..b600edee6c2498 100644
--- a/docs/cpp/source/Doxyfile
+++ b/docs/cpp/source/Doxyfile
@@ -66,6 +66,8 @@ CREATE_SUBDIRS = NO
 FULL_PATH_NAMES = YES
 # Nested folders will be ignored without this.
 RECURSIVE = YES
+# Blacklist certain file patterns from the INPUT section.
+EXCLUDE = ../../../torch/csrc/api/include/torch/nn/pimpl-inl.h
 ################################################################################
 # Output formats for Doxygen to create.                                        #
 ################################################################################
@@ -102,7 +104,7 @@ EXTRACT_ALL = YES
 EXTRACT_PACKAGE = YES
 EXTRACT_STATIC = YES
 CASE_SENSE_NAMES = NO
-EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn*
+EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* std::*
 ################################################################################
 # Docstring control / customization.                                           #
 ################################################################################
diff --git a/docs/cpp/source/building.rst b/docs/cpp/source/building.rst
deleted file mode 100644
index 24ab7a5e69ba3f..00000000000000
--- a/docs/cpp/source/building.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Building
-========
diff --git a/docs/cpp/source/contributing.rst b/docs/cpp/source/contributing.rst
index 5a1988f1db7c62..14ae9224d734ea 100644
--- a/docs/cpp/source/contributing.rst
+++ b/docs/cpp/source/contributing.rst
@@ -1,2 +1,8 @@
-Contributing
-============
+Contributing to PyTorch
+=======================
+
+If you would like to contribute to the PyTorch C++ API, refer to the
+`CONTRIBUTING.md <https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md>`_ document in
+the PyTorch repository. It contains instructions on how to develop PyTorch from source
+and submit a proposal for your patch or feature. We will be happy to review it!
diff --git a/docs/cpp/source/examples.rst b/docs/cpp/source/examples.rst
deleted file mode 100644
index bac945d559fec7..00000000000000
--- a/docs/cpp/source/examples.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Examples
-========
diff --git a/docs/cpp/source/frontend.rst b/docs/cpp/source/frontend.rst
new file mode 100644
index 00000000000000..0a9a9943c6cbcd
--- /dev/null
+++ b/docs/cpp/source/frontend.rst
@@ -0,0 +1,146 @@
+The PyTorch C++ Frontend
+========================
+
+The PyTorch C++ frontend is a C++11 library for CPU and GPU
+tensor computation, with automatic differentiation and high level building
+blocks for state-of-the-art machine learning applications.
+
+Description
+-----------
+
+The PyTorch C++ frontend can be thought of as a C++ version of the
+PyTorch Python frontend, providing automatic differentiation and various higher
+level abstractions for machine learning and neural networks. Specifically,
+it consists of the following components:
+
++----------------------+------------------------------------------------------------------------+
+| Component            | Description                                                            |
++======================+========================================================================+
+| ``torch::Tensor``    | Automatically differentiable, efficient CPU and GPU enabled tensors    |
++----------------------+------------------------------------------------------------------------+
+| ``torch::nn``        | A collection of composable modules for neural network modeling         |
++----------------------+------------------------------------------------------------------------+
+| ``torch::optim``     | Optimization algorithms like SGD, Adam or RMSprop to train your models |
++----------------------+------------------------------------------------------------------------+
+| ``torch::data``      | Datasets, data pipelines and multi-threaded, asynchronous data loaders |
++----------------------+------------------------------------------------------------------------+
+| ``torch::serialize`` | A serialization API for storing and loading model checkpoints          |
++----------------------+------------------------------------------------------------------------+
+| ``torch::python``    | Glue to bind your C++ models into Python                               |
++----------------------+------------------------------------------------------------------------+
+| ``torch::jit``       | Pure C++ access to the TorchScript JIT compiler                        |
++----------------------+------------------------------------------------------------------------+
+
+End-to-end example
+------------------
+
+Here is a simple, end-to-end example of defining and training a small
+neural network on the MNIST dataset:
+
+.. code-block:: cpp
+
+   #include <torch/torch.h>
+
+   // Define a new Module.
+   struct Net : torch::nn::Module {
+     Net() {
+       // Construct and register two Linear submodules.
+       fc1 = register_module("fc1", torch::nn::Linear(8, 64));
+       fc2 = register_module("fc2", torch::nn::Linear(64, 1));
+     }
+
+     // Implement the Net's algorithm.
+     torch::Tensor forward(torch::Tensor x) {
+       // Use one of many tensor manipulation functions.
+       x = torch::relu(fc1->forward(x));
+       x = torch::dropout(x, /*p=*/0.5, /*train=*/is_training());
+       x = torch::sigmoid(fc2->forward(x));
+       return x;
+     }
+
+     // Use one of many "standard library" modules.
+     torch::nn::Linear fc1{nullptr}, fc2{nullptr};
+   };
+
+   // Create a new Net.
+   Net net;
+
+   // Create a multi-threaded data loader for the MNIST dataset.
+   auto data_loader =
+       torch::data::data_loader(torch::data::datasets::MNIST("./data"));
+
+   // Instantiate an SGD optimization algorithm to update our Net's parameters.
+   torch::optim::SGD optimizer(net.parameters(), /*lr=*/0.1);
+
+   for (size_t epoch = 1; epoch <= 10; ++epoch) {
+     size_t batch_index = 0;
+     // Iterate the data loader to yield batches from the dataset.
+     for (auto batch : data_loader) {
+       // Reset gradients.
+       optimizer.zero_grad();
+       // Execute the model on the input data.
+       auto prediction = net.forward(batch.data);
+       // Compute a loss value to judge the prediction of our model.
+       auto loss = torch::binary_cross_entropy(prediction, batch.label);
+       // Compute gradients of the loss w.r.t. the parameters of our model.
+       loss.backward();
+       // Update the parameters based on the calculated gradients.
+       optimizer.step();
+
+       if (batch_index++ % 10 == 0) {
+         std::cout << "Epoch: " << epoch << " | Batch: " << batch_index
+                   << " | Loss: " << loss << std::endl;
+         // Serialize your model periodically as a checkpoint.
+         torch::save(net, "net.pt");
+       }
+     }
+   }
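+
+Once trained, the checkpoint written by ``torch::save`` above can be restored
+and the network run in inference mode. A minimal sketch (the exact set of
+``torch::load`` overloads may vary between releases):
+
+.. code-block:: cpp
+
+   // Restore the parameters saved during training.
+   torch::load(net, "net.pt");
+
+   // Disable gradient tracking while evaluating.
+   torch::NoGradGuard no_grad;
+   auto output = net.forward(torch::ones({1, 8}));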
+
+To see more complete examples of using the PyTorch C++ frontend, see `the example repository
+<https://github.com/pytorch/examples>`_.
+
+Philosophy
+----------
+
+PyTorch's C++ frontend was designed with the idea that the Python frontend is
+great, and should be used when possible; but in some settings, performance and
+portability requirements make the use of the Python interpreter infeasible. For
+example, Python is a poor choice for low latency, high performance or
+multithreaded environments, such as video games or production servers. The
+goal of the C++ frontend is to address these use cases, while not sacrificing
+the user experience of the Python frontend.
+
+As such, the C++ frontend has been written with a few philosophical goals in mind:
+
+* **Closely model the Python frontend in its design**, naming, conventions and
+  functionality. While there may be occasional differences between the two
+  frontends (e.g., where we have dropped deprecated features or fixed "warts"
+  in the Python frontend), we guarantee that the effort in porting a Python model
+  to C++ should lie exclusively in **translating language features**,
+  not modifying functionality or behavior.
+
+* **Prioritize flexibility and user-friendliness over micro-optimization.**
+  In C++, you can often get optimal code, but at the cost of an extremely
+  unfriendly user experience. Flexibility and dynamism are at the heart of
+  PyTorch, and the C++ frontend seeks to preserve this experience, in some
+  cases sacrificing performance (or "hiding" performance knobs) to keep APIs
+  simple and explicable. We want researchers who don't write C++ for a living
+  to be able to use our APIs.
+
+A word of warning: Python is not necessarily slower than
+C++! The Python frontend calls into C++ for almost anything computationally expensive
+(especially any kind of numeric operation), and these operations will take up
+the bulk of time spent in a program. If you would prefer to write Python,
+and can afford to write Python, we recommend using the Python interface to
+PyTorch. However, if you would prefer to write C++, or need to write C++
+(because of multithreading, latency or deployment requirements), the
+C++ frontend to PyTorch provides an API that is approximately as convenient,
+flexible, friendly and intuitive as its Python counterpart. The two frontends
+serve different use cases, work hand in hand, and neither is meant to
+unconditionally replace the other.
+
+Installation
+------------
+
+Instructions on how to install the C++ frontend library distribution, including
+an example for how to build a minimal application depending on LibTorch, may be
+found by following `this <https://pytorch.org/cppdocs/installing.html>`_ link.
diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst
index 2743c3ea650b4e..5fef739c975518 100644
--- a/docs/cpp/source/index.rst
+++ b/docs/cpp/source/index.rst
@@ -1,36 +1,168 @@
 PyTorch C++ API
 ===============
 
-The PyTorch C++ API is a research and production ready C++ interface to PyTorch,
-a library for tensors and dynamic neural networks with strong GPU acceleration.
+These pages provide documentation for the public portions of the PyTorch C++
+API. This API can roughly be divided into five parts:
 
-Description
+- **ATen**: The foundational tensor and mathematical operation library on which all else is built;
+- **Autograd**: Augments ATen with automatic differentiation;
+- **C++ Frontend**: High level constructs for training and evaluation of machine learning models;
+- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter;
+- **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines.
+
+Together, these building blocks form a research and
+production ready C++ library for tensor computation and dynamic neural
+networks with a strong emphasis on GPU acceleration as well as fast CPU
+performance. It is currently in use at Facebook in research and
+production; we look forward to welcoming more users of the PyTorch C++ API.
+
+.. warning::
+
+   At the moment, the C++ API should be considered "beta" stability; we may
+   make major breaking changes to the backend in order to improve the API,
+   or in service of providing the Python interface to PyTorch, which is our
+   most stable and best supported interface.
+
+ATen
+----
+
+ATen is fundamentally a tensor library, on top of which almost all other Python
+and C++ interfaces in PyTorch are built. It provides a core ``Tensor`` class,
+on which many hundreds of operations are defined. Most of these operations have
+both CPU and GPU implementations, to which the ``Tensor`` class will
+dynamically dispatch based on its type. A small example of using ATen could
+look as follows:
+
+.. code-block:: cpp
+
+   #include <ATen/ATen.h>
+
+   at::Tensor a = at::ones({2, 2}, at::kInt);
+   at::Tensor b = at::randn({2, 2});
+   auto c = a + b.to(at::kInt);
+
+This ``Tensor`` class and all other symbols in ATen are found in the `at::`
+namespace, documented
+`here <https://pytorch.org/cppdocs/api/namespace_at.html>`_.
+
+Autograd
+--------
+
+What we term *autograd* are the portions of PyTorch's C++ API that augment the
+ATen ``Tensor`` class with capabilities concerning automatic differentiation.
+The autograd system records operations on tensors to form an *autograd graph*.
+Calling ``backward()`` on a leaf variable in this graph performs reverse mode
+differentiation through the network of functions and tensors spanning the
+autograd graph, ultimately yielding gradients. The following example provides
+a taste of this interface:
+
+.. code-block:: cpp
+
+   #include <torch/csrc/autograd/variable.h>
+   #include <torch/csrc/autograd/function.h>
+
+   at::Tensor a = torch::ones({2, 2}, at::requires_grad());
+   at::Tensor b = torch::randn({2, 2});
+   auto c = a + b;
+   c.backward(); // a.grad() will now hold the gradient of c w.r.t. a.
+
+The ``at::Tensor`` class in ATen is not differentiable by default. To get the
+differentiability that the autograd API provides, you must use tensor
+factory functions from the `torch::` namespace instead of the `at::` namespace.
+For example, while a tensor created with `at::ones` will not be differentiable,
+a tensor created with `torch::ones` will be.
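+
+For instance (a minimal illustrative sketch):
+
+.. code-block:: cpp
+
+   at::Tensor x = at::ones({2, 2});                          // not tracked by autograd
+   at::Tensor y = torch::ones({2, 2}, at::requires_grad()); // tracked by autograd
+   // x.requires_grad() == false, y.requires_grad() == true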
+
+C++ Frontend
+------------
+
+The PyTorch C++ frontend provides a high level, pure C++ modeling interface for
+neural network and general machine learning research and production use cases,
+largely following the Python API in design and provided functionality. The C++
+frontend includes the following:
+
+- An interface for defining machine learning models through a hierarchical module system (like ``torch.nn.Module``);
+- A "standard library" of pre-existing modules for the most common modeling purposes (e.g. convolutions, RNNs, batch normalization etc.);
+- An optimization API, including implementations of popular optimizers such as SGD, Adam, RMSprop and others;
+- A means of representing datasets and data pipelines, including functionality to load data in parallel over many CPU cores (like ``torch.utils.data.DataLoader``);
+- A serialization format for storing and loading checkpoints of a training session;
+- Automatic parallelization of models onto multiple GPUs (like ``torch.nn.parallel.DataParallel``);
+- Support code to easily bind C++ models into Python using pybind11;
+- Entry points to the TorchScript JIT compiler;
+- Helpful utilities to facilitate interfacing with the ATen and Autograd APIs.
+
+See `this <https://pytorch.org/cppdocs/frontend.html>`_ document for a more
+detailed description of the C++ frontend. Relevant sections of the `torch::`
+namespace related to the C++ Frontend include `torch::nn
+<https://pytorch.org/cppdocs/api/namespace_torch__nn.html>`_,
+`torch::optim
+<https://pytorch.org/cppdocs/api/namespace_torch__optim.html>`_,
+`torch::data
+<https://pytorch.org/cppdocs/api/namespace_torch__data.html>`_,
+`torch::serialize
+<https://pytorch.org/cppdocs/api/namespace_torch__serialize.html>`_,
+`torch::jit
+<https://pytorch.org/cppdocs/api/namespace_torch__jit.html>`_
+and `torch::python
+<https://pytorch.org/cppdocs/api/namespace_torch__python.html>`_.
+Examples of the C++ frontend can be found in `this repository
+<https://github.com/pytorch/examples>`_ which is being
+expanded on a continuous and active basis.
+
+.. note::
+
+   Unless you have a particular reason to constrain yourself exclusively to ATen
+   or the Autograd API, the C++ frontend is the recommended entry point to the
+   PyTorch C++ ecosystem. While it is still in beta as we collect user feedback
+   (from you!), it provides both more functionality and better stability
+   guarantees than the ATen and Autograd APIs.
+
+TorchScript
 -----------
-The PyTorch C++ API provides all the major building blocks to research and iterate on
-state of the art machine learning models with a user friendly modern C++ interface,
-as well as providing an excellent platform for deploying machine learning applications
-in bare bones, high performance environments.
+TorchScript is a representation of a PyTorch model that can be understood,
+compiled and serialized by the TorchScript compiler. Fundamentally, TorchScript
+is a programming language in its own right. It is a subset of Python that uses
+the PyTorch API. The C++ interface to TorchScript encompasses three primary pieces of
+functionality:
 
-1. Design Philosophy
-2. Description of components
-3. One small example
+- A mechanism for loading and executing serialized TorchScript models defined in Python;
+- An API for defining custom operators that extend the TorchScript standard library of operations;
+- Just-in-time compilation of TorchScript programs from C++.
 
-License
--------
+The first mechanism may be of great interest to you if you would like to define
+your models in Python as much as possible, but subsequently export them to C++
+for production environments and no-Python inference. You can find out more
+about this by following `this
+<https://pytorch.org/tutorials/advanced/cpp_export.html>`_ link. The second
+API concerns itself with scenarios in which you would like to extend
+TorchScript with custom operators, which can similarly be serialized and
+invoked from C++ during inference. Lastly, the `torch::jit::compile`_
+function may be used to access the TorchScript compiler directly from C++.
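+
+A sketch of the first mechanism (illustrative; the exact header and return
+type may still change while this interface is in beta):
+
+.. code-block:: cpp
+
+   #include <torch/script.h>
+
+   // Load a TorchScript module previously exported from Python.
+   auto module = torch::jit::load("model.pt");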
+
+C++ Extensions
+--------------
+
+*C++ Extensions* offer a simple yet powerful way of accessing all of the above
+interfaces for the purpose of extending regular Python use-cases of PyTorch.
+C++ extensions are most commonly used to implement custom operators in C++ or
+CUDA to accelerate research in vanilla PyTorch setups. The C++ extension API
+does not add any new functionality to the PyTorch C++ API. Instead, it
+provides integration with Python setuptools as well as JIT compilation
+mechanisms that allow access to ATen, the autograd and other C++ APIs from
+Python. To learn more about the C++ extension API, see
+`this <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_ tutorial.
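+
+The core of such an extension is roughly the following (a sketch; the
+``twice`` operator is a made-up example, and ``TORCH_EXTENSION_NAME`` is
+defined by the extension build integration):
+
+.. code-block:: cpp
+
+   #include <torch/torch.h>
+
+   // A custom operator implemented in terms of ATen operations.
+   at::Tensor twice(at::Tensor input) {
+     return input + input;
+   }
+
+   // Expose the operator to Python.
+   PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+     m.def("twice", &twice, "Add a tensor to itself");
+   }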
 
 Contents
-========
+--------
 
 .. toctree::
    :maxdepth: 2
 
-   api/library_root
-   examples
-   building
+   frontend
+   installing
    contributing
+   api/library_root
 
 Indices and tables
diff --git a/docs/cpp/source/installing.rst b/docs/cpp/source/installing.rst
new file mode 100644
index 00000000000000..d304f8168838f0
--- /dev/null
+++ b/docs/cpp/source/installing.rst
@@ -0,0 +1,132 @@
+Installing C++ Distributions of PyTorch
+=======================================
+
+We provide binary distributions of all headers, libraries and CMake
+configuration files required to depend on PyTorch. We call this distribution
+*LibTorch*, and you can download ZIP archives containing the latest LibTorch
+distribution on `our website <https://pytorch.org>`_. Below
+is a small example of writing a minimal application that depends on LibTorch
+and uses the `at::Tensor` class which comes with the PyTorch C++ API.
+
+Minimal Example
+---------------
+
+The first step is to download the LibTorch ZIP archive via the link above. For
+example:
+
+.. code-block:: sh
+
+   wget http://pytorch.org/libtorch/libtorch-latest.zip
+   unzip libtorch-latest.zip
+   ls -1R libtorch-latest
+
+Next, we can write a minimal CMake build configuration to develop a small
+application that depends on LibTorch. CMake is not a hard requirement for using
+LibTorch, but it is the recommended and blessed build system and will be well
+supported into the future. A minimal `CMakeLists.txt` file could look like
+this:
+
+.. code-block:: cmake
+
+   cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+   project(example-app)
+
+   find_package(Torch REQUIRED)
+
+   add_executable(example-app example-app.cpp)
+   target_link_libraries(example-app "${TORCH_LIBRARIES}")
+   set_property(TARGET example-app PROPERTY CXX_STANDARD 11)
+
+The implementation of our example will simply create a new `at::Tensor` and
+print it:
+
+.. code-block:: cpp
+
+   #include <torch/torch.h>
+   #include <iostream>
+
+   int main() {
+     at::Tensor tensor = torch::rand({2, 3});
+     std::cout << tensor << std::endl;
+   }
+
+While there are more fine-grained headers you can include to access only parts
+of the PyTorch C++ API, including `torch/torch.h` is the most foolproof way of
+including most of its functionality.
+
+The last step is to build the application. For this, assume our example
+directory is laid out like this:
+
+.. code-block:: sh
+
+   example-app/
+     CMakeLists.txt
+     example-app.cpp
+
+We can now run the following commands to build the application from within the
+``example-app/`` folder:
+
+.. code-block:: sh
+
+   mkdir build
+   cd build
+   cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch ..
+   make
+
+where ``/path/to/libtorch`` should be the full path to the unzipped LibTorch
+distribution. If all goes well, it will look something like this:
+
+.. code-block:: sh
+
+   root@4b5a67132e81:/example-app# mkdir build
+   root@4b5a67132e81:/example-app# cd build
+   root@4b5a67132e81:/example-app/build# cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch ..
+   -- The C compiler identification is GNU 5.4.0
+   -- The CXX compiler identification is GNU 5.4.0
+   -- Check for working C compiler: /usr/bin/cc
+   -- Check for working C compiler: /usr/bin/cc -- works
+   -- Detecting C compiler ABI info
+   -- Detecting C compiler ABI info - done
+   -- Detecting C compile features
+   -- Detecting C compile features - done
+   -- Check for working CXX compiler: /usr/bin/c++
+   -- Check for working CXX compiler: /usr/bin/c++ -- works
+   -- Detecting CXX compiler ABI info
+   -- Detecting CXX compiler ABI info - done
+   -- Detecting CXX compile features
+   -- Detecting CXX compile features - done
+   -- Looking for pthread.h
+   -- Looking for pthread.h - found
+   -- Looking for pthread_create
+   -- Looking for pthread_create - not found
+   -- Looking for pthread_create in pthreads
+   -- Looking for pthread_create in pthreads - not found
+   -- Looking for pthread_create in pthread
+   -- Looking for pthread_create in pthread - found
+   -- Found Threads: TRUE
+   -- Configuring done
+   -- Generating done
+   -- Build files have been written to: /example-app/build
+   root@4b5a67132e81:/example-app/build# make
+   Scanning dependencies of target example-app
+   [ 50%] Building CXX object CMakeFiles/example-app.dir/example-app.cpp.o
+   [100%] Linking CXX executable example-app
+   [100%] Built target example-app
+
+Executing the resulting ``example-app`` binary found in the ``build`` folder
+should now merrily print the tensor (exact output subject to randomness):
+
+.. code-block:: sh
+
+   root@4b5a67132e81:/example-app/build# ./example-app
+   0.2063  0.6593  0.0866
+   0.0796  0.5841  0.1569
+   [ Variable[CPUFloatType]{2,3} ]
+
+Support
+-------
+
+If you run into any troubles with this installation and minimal usage guide,
+please use our `forum <https://discuss.pytorch.org>`_ or `GitHub issues
+<https://github.com/pytorch/pytorch/issues>`_ to get in touch.

From 325101263ae8e240f9aeed50d9c9ca9ff510a574 Mon Sep 17 00:00:00 2001
From: zrphercule
Date: Wed, 26 Sep 2018 20:47:51 -0700
Subject: [PATCH 10/82] Aten: catch2gtest (#11846)

Summary:
Migrated all tests in aten to use gtest, except for basic.cpp.
Since the features of gtest are different from those of catch, some of the
tests have been rewritten with similar meaning.
The basic test has a version conflict with valgrind according to CI, so that
test case still uses catch. It will be resolved by a different PR.
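For reference, the mechanical translation pattern applied throughout these
files looks roughly like this (an illustrative summary, distilled from the
diffs below):

    CATCH_TEST_CASE("name", "[tag]")   ->  TEST(SuiteName, CaseName)
    CATCH_REQUIRE(a == b);             ->  ASSERT_EQ(a, b);
    CATCH_REQUIRE(cond);               ->  ASSERT_TRUE(cond);
    _CATCH_REQUIRE_THROWS(expr);       ->  ASSERT_ANY_THROW(expr);
    CATCH_SECTION("sub-case") { ... }  ->  a separate TEST or helper function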
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11846 Differential Revision: D10080860 Pulled By: zrphercule fbshipit-source-id: 439d4cf33fb6ccbe79b797860342853c63e59081 --- aten/src/ATen/test/apply_test.cpp | 202 +++++----- aten/src/ATen/test/apply_utils_test.cpp | 21 +- aten/src/ATen/test/atest.cpp | 86 +++-- aten/src/ATen/test/basic.cpp | 143 +++---- aten/src/ATen/test/broadcast_test.cpp | 326 ++++++++-------- aten/src/ATen/test/catch_utils.hpp | 8 +- aten/src/ATen/test/cuda_half_test.cu | 36 +- aten/src/ATen/test/cuda_optional_test.cu | 15 +- .../test/cuda_packedtensoraccessor_test.cu | 20 +- aten/src/ATen/test/cuda_rng_test.cpp | 8 +- aten/src/ATen/test/cudnn_test.cpp | 16 +- aten/src/ATen/test/dlconvertor_test.cpp | 14 +- aten/src/ATen/test/half_test.cpp | 141 +++---- aten/src/ATen/test/integer_divider_test.cu | 97 +++-- aten/src/ATen/test/native_test.cpp | 348 ++++++++++-------- aten/src/ATen/test/scalar_tensor_test.cpp | 288 ++++++++------- aten/src/ATen/test/scalar_test.cpp | 87 ++--- aten/src/ATen/test/stream_test.cpp | 179 +++++---- aten/src/ATen/test/test_parallel.cpp | 10 +- aten/src/ATen/test/undefined_tensor_test.cpp | 49 ++- aten/src/ATen/test/weakref_test.cpp | 93 ++--- aten/src/ATen/test/wrapdim_test.cpp | 70 ++-- 22 files changed, 1194 insertions(+), 1063 deletions(-) diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index fc39eccee3926b..93a2d705bd8a08 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ b/aten/src/ATen/test/apply_test.cpp @@ -1,121 +1,135 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "cuda.h" #include "cuda_runtime.h" #include "ATen/cuda/detail/TensorInfo.cuh" - +#define ASSERT_EQ_CUDA(X, Y) \ + { \ + bool _isEQ = X == Y; \ + ASSERT_TRUE(_isEQ); \ + } /* -Tests related to tensor indexing and applying operations. + Tests related to tensor indexing and applying operations. 
*/ #ifndef _WIN32 -CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { - int sizes[] = {4, 4}; - int strides[] = {4, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); +// CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D +// contiguous") { +TEST(ApplyTest, Contiguous2D) { + int sizes[] = {4, 4}; + int strides[] = {4, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (4 * 4)); } -CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { - int sizes[] = {6, 3, 7}; - int strides[] = {3 * 7, 7, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); +// CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D +// contiguous") { +TEST(ApplyTest, Contiguous3D) { + int sizes[] = {6, 3, 7}; + int strides[] = {3 * 7, 7, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (6 * 3 * 7)); } - -CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { - int sizes[] = {4, 3, 2}; - int strides[] = {3 * 3, 3, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); - CATCH_REQUIRE(ti.sizes[1] == 2); +// CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor +// to a 2D tensor") { +TEST(ApplyTest, PartialCollapse3D) { + int sizes[] = {4, 3, 2}; + int strides[] = {3 * 3, 3, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (4 * 3)); + ASSERT_EQ_CUDA(ti.sizes[1], 2); } -CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { - int sizes[] = {3, 2}; - int strides[] = {2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); - CATCH_REQUIRE(ti.strides[0] == 2); +// Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor +TEST(ApplyTest, StridedCollapse2D) { + int sizes[] = {3, 2}; + int strides[] = {2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 2)); + ASSERT_EQ_CUDA(ti.strides[0], 2); } -CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); - CATCH_REQUIRE(ti.strides[0] == 22); - CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); - CATCH_REQUIRE(ti.strides[1] == 2); +// Collapses a 4D tensor to a 2D tensor +TEST(ApplyTest, PartialStridedCollapse4D) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 6)); + 
ASSERT_EQ_CUDA(ti.strides[0], 22); + ASSERT_EQ_CUDA(ti.sizes[1], (5 * 2)); + ASSERT_EQ_CUDA(ti.strides[1], 2); } -CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { - int sizes[] = {1, 10, 1, 5, 4}; - int strides[] = {4, 0, 16, 0, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); - CATCH_REQUIRE(ti.strides[0] == 0); - CATCH_REQUIRE(ti.sizes[1] == 4); - CATCH_REQUIRE(ti.strides[1] == 1); +// Collapses a 5D tensor to a 1D tensor +TEST(ApplyTest, CollapsesZerosAndOnes) { + int sizes[] = {1, 10, 1, 5, 4}; + int strides[] = {4, 0, 16, 0, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (10 * 5)); + ASSERT_EQ_CUDA(ti.strides[0], 0); + ASSERT_EQ_CUDA(ti.sizes[1], 4); + ASSERT_EQ_CUDA(ti.strides[1], 1); } -CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { - int sizes[] = {1, 1, 1}; - int strides[] = {17, 12, 3}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims() == 0); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == 1); - CATCH_REQUIRE(ti.strides[0] == 1); +// Collapses a 3D tensor to a point tensor +TEST(ApplyTest, CollapseToPointTensor) { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(), 0); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], 1); + ASSERT_EQ_CUDA(ti.strides[0], 1); } -CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims(1) == 1); - CATCH_REQUIRE(ti.dims == 3); - CATCH_REQUIRE(ti.sizes[0] == 3); - CATCH_REQUIRE(ti.strides[0] == (6 * 22)); - CATCH_REQUIRE(ti.sizes[1] == 6); - CATCH_REQUIRE(ti.strides[1] == 22); - CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); - CATCH_REQUIRE(ti.strides[2] == 2); +// Collapses a 4D tensor to a 3D tensor +TEST(ApplyTest, ExcludingInContiguous4D) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(1), 1); + ASSERT_EQ_CUDA(ti.dims, 3); + ASSERT_EQ_CUDA(ti.sizes[0], 3); + ASSERT_EQ_CUDA(ti.strides[0], (6 * 22)); + ASSERT_EQ_CUDA(ti.sizes[1], 6); + ASSERT_EQ_CUDA(ti.strides[1], 22); + ASSERT_EQ_CUDA(ti.sizes[2], (5 * 2)); + ASSERT_EQ_CUDA(ti.strides[2], 2); } -CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims(2) == 1); - CATCH_REQUIRE(ti.dims == 3); - CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); - CATCH_REQUIRE(ti.strides[0] == 22); - CATCH_REQUIRE(ti.sizes[1] == 5); - CATCH_REQUIRE(ti.strides[1] == 4); - CATCH_REQUIRE(ti.sizes[2] == 2); - CATCH_REQUIRE(ti.strides[2] == 2); +// Collapses a 4D tensor to a 3D tensor +TEST(ApplyTest, RovingExclusion) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(2), 1); + ASSERT_EQ_CUDA(ti.dims, 3); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 
6)); + ASSERT_EQ_CUDA(ti.strides[0], 22); + ASSERT_EQ_CUDA(ti.sizes[1], 5); + ASSERT_EQ_CUDA(ti.strides[1], 4); + ASSERT_EQ_CUDA(ti.sizes[2], 2); + ASSERT_EQ_CUDA(ti.strides[2], 2); } -CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { - int sizes[] = {1, 1, 1}; - int strides[] = {17, 12, 3}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); -} - +// Attempts to exclude a nonexisting dimension +TEST(ApplyTest, InvalidExclusion) { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ASSERT_ANY_THROW(ti.collapseDims(5)); +} #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index ab7e3522bbedae..71715a2d4b0d6e 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +107,38 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +// apply utils test 2-dim small contiguous +TEST(ApplyUtilsTest, Contiguous2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { +// apply utils test 2-dim small +TEST(ApplyUtilsTest, Small2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { +// apply utils test 2-dim +TEST(ApplyUtilsTest, _2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { +// apply utils test 3-dim +TEST(ApplyUtilsTest, _3D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } -CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +// apply utils test 3-dim medium +TEST(ApplyUtilsTest, Medium3D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { +// apply utils test 10-dim +TEST(ApplyUtilsTest, _10D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index edb3f79fd2d55d..96c5ed11897481 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -8,17 +8,17 @@ using namespace std; using namespace at; void trace() { - Tensor foo = rand({12,12}); + Tensor foo = rand({12, 12}); // ASSERT foo is 2-dimensional and holds floats. 
- auto foo_a = foo.accessor(); + auto foo_a = foo.accessor(); float trace = 0; - for(int i = 0; i < foo_a.size(0); i++) { + for (int i = 0; i < foo_a.size(0); i++) { trace += foo_a[i][i]; } - EXPECT_FLOAT_EQ(foo.trace().item(), trace); + ASSERT_FLOAT_EQ(foo.trace().item(), trace); } // TEST_CASE( "atest", "[]" ) { @@ -26,82 +26,78 @@ TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); - auto foo = rand({12,6}); + auto foo = rand({12, 6}); - EXPECT_EQ(foo.size(0), 12); - EXPECT_EQ(foo.size(1), 6); + ASSERT_EQ(foo.size(0), 12); + ASSERT_EQ(foo.size(1), 6); - foo = foo+foo*3; + foo = foo + foo * 3; foo -= 4; Scalar a = 4; float b = a.to(); - EXPECT_EQ(b, 4); + ASSERT_EQ(b, 4); - foo = (foo*foo) == (foo.pow(3)); - foo = 2 + (foo+1); - //foo = foo[3]; - auto foo_v = foo.accessor(); + foo = (foo * foo) == (foo.pow(3)); + foo = 2 + (foo + 1); + // foo = foo[3]; + auto foo_v = foo.accessor(); - for(int i = 0; i < foo_v.size(0); i++) { - for(int j = 0; j < foo_v.size(1); j++) { + for (int i = 0; i < foo_v.size(0); i++) { + for (int j = 0; j < foo_v.size(1); j++) { foo_v[i][j]++; } } - EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); + ASSERT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); - float data[] = { 1, 2, 3, - 4, 5, 6}; + float data[] = {1, 2, 3, 4, 5, 6}; - auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); - auto f_a = f.accessor(); + auto f = CPU(kFloat).tensorFromBlob(data, {1, 2, 3}); + auto f_a = f.accessor(); - EXPECT_EQ(f_a[0][0][0], 1.0); - EXPECT_EQ(f_a[0][1][1], 5.0); + ASSERT_EQ(f_a[0][0][0], 1.0); + ASSERT_EQ(f_a[0][1][1], 5.0); - EXPECT_EQ(f.strides()[0], 6); - EXPECT_EQ(f.strides()[1], 3); - EXPECT_EQ(f.strides()[2], 1); - EXPECT_EQ(f.sizes()[0], 1); - EXPECT_EQ(f.sizes()[1], 2); - EXPECT_EQ(f.sizes()[2], 3); + ASSERT_EQ(f.strides()[0], 6); + ASSERT_EQ(f.strides()[1], 3); + ASSERT_EQ(f.strides()[2], 1); + ASSERT_EQ(f.sizes()[0], 1); + ASSERT_EQ(f.sizes()[1], 2); + ASSERT_EQ(f.sizes()[2], 3); // TODO(ezyang): maybe do a more precise exception type. 
- ASSERT_THROW(f.resize_({3,4,5}), std::exception); + ASSERT_THROW(f.resize_({3, 4, 5}), std::exception); { int isgone = 0; { - auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { - isgone++; - }); + auto f2 = + CPU(kFloat).tensorFromBlob(data, {1, 2, 3}, [&](void*) { isgone++; }); } - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } { int isgone = 0; Tensor a_view; { - auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { - isgone++; - }); - a_view = f2.view({3,2,1}); + auto f2 = + CPU(kFloat).tensorFromBlob(data, {1, 2, 3}, [&](void*) { isgone++; }); + a_view = f2.view({3, 2, 1}); } - EXPECT_EQ(isgone, 0); + ASSERT_EQ(isgone, 0); a_view.reset(); - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } - if(at::hasCUDA()) { + if (at::hasCUDA()) { int isgone = 0; { - auto base = CUDA(kFloat).tensor({1,2,3}); - auto f2 = CUDA(kFloat).tensorFromBlob(base.data_ptr(), {1,2,3}, [&](void*) { - isgone++; - }); + auto base = CUDA(kFloat).tensor({1, 2, 3}); + auto f2 = CUDA(kFloat).tensorFromBlob( + base.data_ptr(), {1, 2, 3}, [&](void*) { isgone++; }); } - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 361d24b5a6b76f..791d80b1f42f95 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -19,36 +19,35 @@ using namespace at; using Catch::Matchers::StartsWith; -static void test(Type & type) { - CATCH_SECTION( "resize" ) { +static void test(Type& type) { + CATCH_SECTION("resize") { auto a = at::empty({0}, type.options()); - a.resize_({3,4}); + a.resize_({3, 4}); CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); CATCH_REQUIRE(a.numel() == 35); - } - CATCH_SECTION( "ones and dot" ) { + CATCH_SECTION("ones and dot") { Tensor b0 = ones({1, 1}, type); - CATCH_REQUIRE(2 == (b0+b0).sum().item()); + CATCH_REQUIRE(2 == (b0 + b0).sum().item()); Tensor b1 = ones({1, 2}, type); - CATCH_REQUIRE(4 == (b1+b1).sum().item()); + CATCH_REQUIRE(4 == (b1 + b1).sum().item()); Tensor b = ones({3, 4}, type); - CATCH_REQUIRE(24 == (b+b).sum().item()); + CATCH_REQUIRE(24 == (b + b).sum().item()); CATCH_REQUIRE(12 == b.numel()); CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).item() == 12); } - CATCH_SECTION( "rand" ) { - for(auto i = 0; i < 10; i++) { - Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); + CATCH_SECTION("rand") { + for (auto i = 0; i < 10; i++) { + Tensor a = rand({3, 4}, type.toScalarType(i % 2 == 0 ? 
kFloat : kDouble)); } } - CATCH_SECTION( "sort" ) { + CATCH_SECTION("sort") { Tensor b = rand({3, 4}, type); auto z = b.sort(1); @@ -57,93 +56,101 @@ static void test(Type & type) { CATCH_REQUIRE(z_sorted[0][0].item() < z_sorted[0][1].item()); } - if(type.backend() != Backend::CUDA) - CATCH_SECTION( "randperm" ) { - Tensor b = randperm(15, type); - Tensor rv, ri; - std::tie(rv, ri) = sort(b, 0); - CATCH_REQUIRE(rv[0].item() <= rv[1].item()); - } + if (type.backend() != Backend::CUDA) + CATCH_SECTION("randperm") { + Tensor b = randperm(15, type); + Tensor rv, ri; + std::tie(rv, ri) = sort(b, 0); + CATCH_REQUIRE(rv[0].item() <= rv[1].item()); + } - CATCH_SECTION( "context" ) { + CATCH_SECTION("context") { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - CATCH_SECTION( "add" ) { + CATCH_SECTION("add") { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); - //TODO:0-dim Tensor d(3.f); + // TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE(add(c, d).allclose(a + a + b + d)); } - CATCH_SECTION( "loads of adds" ) { + CATCH_SECTION("loads of adds") { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for(auto i = 0; i < 100000; i++) { + for (auto i = 0; i < 100000; i++) { add_out(r, r, d); } auto end = std::chrono::high_resolution_clock::now(); - //TODO TEST PERF? - std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); + // TODO TEST PERF? + std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + CATCH_REQUIRE(norm(100000 * d).item() == norm(r).item()); } - CATCH_SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION("loads of adds (with copy)") { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for(auto i = 0; i < 100000; i++) { + for (auto i = 0; i < 100000; i++) { r = add(r, d); } auto end = std::chrono::high_resolution_clock::now(); - //TODO TEST PERF? - std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); + // TODO TEST PERF? 
+ std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + CATCH_REQUIRE(norm(100000 * d).item() == norm(r).item()); } - CATCH_SECTION( "isContiguous" ) { + CATCH_SECTION("isContiguous") { Tensor a = rand({3, 4}, type); CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); CATCH_REQUIRE(!a.is_contiguous()); } - CATCH_SECTION( "permute" ) { + CATCH_SECTION("permute") { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - CATCH_SECTION( "mm" ) { + CATCH_SECTION("mm") { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - CATCH_SECTION( "squeeze" ) { + CATCH_SECTION("squeeze") { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); - //TODO 0-dim squeeze + // TODO 0-dim squeeze CATCH_REQUIRE(a[0].equal(b)); } - CATCH_SECTION( "copy" ) { + CATCH_SECTION("copy") { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); CATCH_REQUIRE(a.equal(e)); } - CATCH_SECTION( "copy (broadcasting)" ) { + CATCH_SECTION("copy (broadcasting)") { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); @@ -152,12 +159,12 @@ static void test(Type & type) { } } - CATCH_SECTION( "abs(value)" ) { + CATCH_SECTION("abs(value)") { Tensor r = at::abs(type.scalarTensor(-3)); CATCH_REQUIRE(r.item() == 3); } -//TODO(zach): operator overloads +// TODO(zach): operator overloads #if 0 { std::cout << "eq (value):" << std::endl; @@ -168,60 +175,60 @@ static void test(Type & type) { } #endif - CATCH_SECTION( "adding a value with a scalar" ) { + CATCH_SECTION("adding a value with a scalar") { Tensor a = rand({4, 3}, type); - CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4, 3}, type) + a).equal(add(a, 1))); } - CATCH_SECTION( "select" ) { + CATCH_SECTION("select") { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); - CATCH_REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE(a[0][3].equal(a_13[0])); + CATCH_REQUIRE(a[2][3].equal(a_13_02)); } - CATCH_SECTION( "zero-dim" ) { - Tensor a = type.scalarTensor(4); //rand(type, {1}); + CATCH_SECTION("zero-dim") { + Tensor a = type.scalarTensor(4); // rand(type, {1}); - Tensor b = rand({3,4}, type); + Tensor b = rand({3, 4}, type); CATCH_REQUIRE((a + a).dim() == 0); CATCH_REQUIRE((1 + a).dim() == 0); CATCH_REQUIRE((b + a).dim() == 2); CATCH_REQUIRE((a + b).dim() == 2); - auto c = rand({3,4}, type); + auto c = rand({3, 4}, type); CATCH_REQUIRE(c[1][2].dim() == 0); - auto f = rand({3,4}, type); + auto f = rand({3, 4}, type); f[2] = zeros({4}, type); f[1][0] = -1; CATCH_REQUIRE(f[2][0].item() == 0); } - CATCH_SECTION( "tensor from TH" ) { + CATCH_SECTION("tensor from TH") { int a = 4; - THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); + THFloatTensor* t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); - Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); + Tensor tt = CPU(kFloat).unsafeTensorFromTH(t, false); CATCH_REQUIRE_NOTHROW(tt); } - CATCH_SECTION( "item" ) { - Tensor a = zeros({3,4}); - Tensor b = ones({3,7}); - Tensor c = cat({a,b},1); + CATCH_SECTION("item") { + Tensor a = zeros({3, 4}); + Tensor b = ones({3, 7}); + Tensor c = cat({a, b}, 1); CATCH_REQUIRE(c.size(1) == 11); Tensor 
e = rand({}); CATCH_REQUIRE(*e.data() == e.sum().item()); } - CATCH_SECTION( "to string" ) { - Tensor b = ones({3,7})*.0000001f; + CATCH_SECTION("to string") { + Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0, expect.size()) == expect); } CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); @@ -243,8 +250,7 @@ static void test(Type & type) { } CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), - StartsWith( - "Can only index tensors with integral scalars")); + StartsWith("Can only index tensors with integral scalars")); } CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); @@ -254,8 +260,7 @@ static void test(Type & type) { } CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), - StartsWith( - "Can only index tensors with integral scalars")); + StartsWith("Can only index tensors with integral scalars")); CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); @@ -275,16 +280,16 @@ static void test(Type & type) { } } -CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE("basic tests CPU", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE("basic tests GPU", "[cuda]") { manual_seed(123, at::kCUDA); - if(at::hasCUDA()) { + if (at::hasCUDA()) { test(CUDA(kFloat)); } } diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index 822a1d79df1bda..8bebb7d8fdd907 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,192 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" + +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -CATCH_TEST_CASE( "broadcast", "[]" ) { +// can't expand empty tensor +void TestEmptyTensor(Type& T) { + auto empty = randn({0}, T); + ASSERT_ANY_THROW(empty.expand({3})); +} + +// out-place function with 2 args +void TestOut2Basic(Type& T) { + auto a = randn({3, 1}, T); + auto b = randn({5}, T); + std::vector expanded_sizes = {3, 5}; + ASSERT_TRUE( + (a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); +} + +// with scalar +void TestOut2WithScalar(Type& T) { + auto aScalar = ones({1}, T); + aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + auto b = randn({3, 5}, T); + ASSERT_TRUE( + (aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); +} +// old fallback behavior yields error +void TestOut2OldFallback(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({5, 3}, T); + ASSERT_ANY_THROW(a + b); +} + +// with mismatched sizes +void TestOut2MismatchedSizes(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({7, 5}, T); + ASSERT_ANY_THROW(a + b); +} + +// out-place function with 3 args +void TestOut3Basic(Type& T) { + auto a = randn({3, 1, 1}, T); + auto b = randn({1, 2, 1}, T); + auto c = randn({1, 1, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + ASSERT_TRUE((a + b + c).equal( + a.expand(expanded_sizes) + b.expand(expanded_sizes) + + c.expand(expanded_sizes))); +} + +// with scalar +void TestOut3WithScalar(Type& T) { + auto aTensorScalar = ones({1}, T); + aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + auto b = randn({3, 2, 1}, T); + auto c = randn({1, 2, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + 
ASSERT_TRUE(aTensorScalar.addcmul(b, c).equal( + aTensorScalar.expand(expanded_sizes) + .addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); +} + +// old fallback behavior yields error +void TestOut3OldFallback(Type& T) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 3, 2}, T); + ASSERT_ANY_THROW(a.addcmul(b, c)); +} + +// with mismatched sizes +void TestOut3MismatchedSizes(Type& T) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 5, 5}, T); + ASSERT_ANY_THROW(a.addcmul(b, c)); +} + +// in-place function with 2 args +void TestIn2Basic(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({3, 1}, T); + ASSERT_TRUE((a + b).equal(a + b.expand({3, 5}))); +} + +// with scalar +void TestIn2WithScalar(Type& T) { + auto a = randn({3, 5}, T); + auto bScalar = ones({1}, T); + bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); +} + +// error: would have to expand inplace arg +void TestIn2ExpandError(Type& T) { + auto a = randn({1, 5}, T); + auto b = randn({3, 1}, T); + ASSERT_ANY_THROW(a.add_(b)); +} + +// in-place function with 3 args +void TestIn3Basic(Type& T) { + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + auto aClone = a.clone(); + ASSERT_TRUE(a.addcmul_(b, c).equal( + aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); +} + +// with scalar +void TestIn3WithScalar(Type& T) { + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + auto aClone = a.clone(); + auto bScalar = ones({1}, T); + bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(a.addcmul_(bScalar, c) + .equal(aClone.addcmul_( + bScalar.expand(a.sizes()), c.expand(a.sizes())))); +} + +// error: would have to expand inplace arg +void TestIn3ExpandError(Type& T) { + auto a = randn({1, 3, 5}, T); + auto b = randn({4, 1, 1}, T); + auto c = randn({1, 3, 1}, T); + ASSERT_ANY_THROW(a.addcmul_(b, c)); +} + +// explicit dim specification +void TestExplicitDimBasic(Type& T) { + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + ASSERT_TRUE(a.addmm(b, c).equal(a.expand({5, 7}).addmm(b, c))); +} + +// with scalar +void TestExplicitDimWithScalar(Type& T) { + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + Tensor aScalar = ones({1}, T); + aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); +} + +// with mismatched sizes +void TestExplicitDimWithMismatchedSizes(Type& T) { + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + auto a = randn({3, 3}, T); + ASSERT_ANY_THROW(a.addmm(b, c)); +} + +TEST(BroadcastTest, Broadcast) { manual_seed(123, at::kCPU); + Type& T = CPU(kFloat); + + TestEmptyTensor(T); + + TestOut2Basic(T); + TestOut2WithScalar(T); + TestOut2OldFallback(T); + TestOut2MismatchedSizes(T); + + TestOut3Basic(T); + TestOut3WithScalar(T); + TestOut3OldFallback(T); + TestOut3MismatchedSizes(T); + + TestIn2Basic(T); + TestIn2WithScalar(T); + TestIn2ExpandError(T); + + TestIn3Basic(T); + TestIn3WithScalar(T); + TestIn3ExpandError(T); - Type & T = CPU(kFloat); - - // 0) pre-req tests: - CATCH_SECTION( "can't expand empty tensor" ) { - auto empty = randn({0}, T); - _CATCH_REQUIRE_THROWS(empty.expand({3})); - } - - // 1) out-place function with 2 args - CATCH_SECTION( "out-place function with 2 args" ) { - - CATCH_SECTION( 
"basic" ) { - auto a = randn({3, 1}, T); - auto b = randn({5}, T); - std::vector expanded_sizes = {3, 5}; - CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); - } - - CATCH_SECTION( "with scalar" ) { - auto aScalar = ones({1}, T); - aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - auto b = randn({3, 5}, T); - CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); - } - - CATCH_SECTION( "old fallback behavior yields error" ) { - auto a = randn({3, 5}, T); - auto b = randn({5, 3}, T); - _CATCH_REQUIRE_THROWS(a + b); - } - - CATCH_SECTION( "with mismatched sizes" ) { - auto a = randn({3, 5}, T); - auto b = randn({7, 5}, T); - _CATCH_REQUIRE_THROWS(a + b); - } - } - - CATCH_SECTION( "out-place function with 3 args" ) { - - CATCH_SECTION( "basic" ) { - auto a = randn({3, 1, 1}, T); - auto b = randn({1, 2, 1}, T); - auto c = randn({1, 1, 5}, T); - std::vector expanded_sizes = {3, 2, 5}; - CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); - } - - CATCH_SECTION( "with scalar" ) { - auto aTensorScalar = ones({1}, T); - aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - auto b = randn({3, 2, 1}, T); - auto c = randn({1, 2, 5}, T); - std::vector expanded_sizes = {3, 2, 5}; - CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( - aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); - } - - CATCH_SECTION( "old fallback behavior yields error" ) { - auto a = randn({3, 2, 5}, T); - auto b = randn({2, 3, 5}, T); - auto c = randn({5, 3, 2}, T); - _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); - } - - CATCH_SECTION( "with mismatched sizes" ){ - auto a = randn({3, 2, 5}, T); - auto b = randn({2, 3, 5}, T); - auto c = randn({5, 5, 5}, T); - _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); - } - } - - CATCH_SECTION( "in-place function with 2 args" ) { - CATCH_SECTION( "basic" ) { - auto a = randn({3, 5}, T); - auto b = randn({3, 1}, T); - CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); - } - - CATCH_SECTION( "with scalar" ) { - auto a = randn({3, 5}, T); - auto bScalar = ones({1}, T); - bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); - } - - CATCH_SECTION( "error: would have to expand inplace arg" ) { - auto a = randn({1, 5}, T); - auto b = randn({3, 1}, T); - _CATCH_REQUIRE_THROWS(a.add_(b)); - } - } - - CATCH_SECTION( "in-place function with 3 args" ) { - - auto a = randn({3, 5, 2}, T); - auto b = randn({3, 1, 2}, T); - auto c = randn({1, 5, 1}, T); - - CATCH_SECTION( "basic" ) { - auto aClone = a.clone(); - CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); - } - - CATCH_SECTION( "with scalar" ) { - auto aClone = a.clone(); - auto bScalar = ones({1}, T); - bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); - } - - CATCH_SECTION( "error: would have to expand inplace arg" ) { - auto a = randn({1, 3, 5}, T); - auto b = randn({4, 1, 1}, T); - auto c = randn({1, 3, 1}, T); - _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); - } - } - - CATCH_SECTION( "explicit dim specification" ) { - - auto a = randn({1}, T); - auto b = randn({5, 3}, T); - auto c = randn({3, 7}, T); - - CATCH_SECTION( "basic" ) { - CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); - } - - CATCH_SECTION( "with scalar" ) { - Tensor aScalar = 
ones({1}, T);
- aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true);
- CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c)));
- }
-
- CATCH_SECTION( "with mismatched sizes" ) {
- auto a = randn({3, 3}, T);
- _CATCH_REQUIRE_THROWS(a.addmm(b, c));
- }
- }
+ TestExplicitDimBasic(T);
+ TestExplicitDimWithScalar(T);
+ TestExplicitDimWithMismatchedSizes(T);
}
diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp
index b9b0a87990a9ce..9e7696b1372263 100644
--- a/aten/src/ATen/test/catch_utils.hpp
+++ b/aten/src/ATen/test/catch_utils.hpp
@@ -3,6 +3,8 @@
 #define CATCH_CONFIG_PREFIX_ALL
 #include <catch.hpp>
-// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning;
-// define our own version that doesn't warn.
-#define _CATCH_REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ )
+// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes
+// warning; define our own version that doesn't warn.
+#define _CATCH_REQUIRE_THROWS(...) \
+ INTERNAL_CATCH_THROWS( \
+ "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__)
diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu
index cce267100589e1..56ca901931384d 100644
--- a/aten/src/ATen/test/cuda_half_test.cu
+++ b/aten/src/ATen/test/cuda_half_test.cu
@@ -1,5 +1,4 @@
-#define CATCH_CONFIG_MAIN
-#include "catch_utils.hpp"
+#include "gtest/gtest.h"
 #include "ATen/ATen.h"
 #include "ATen/cuda/NumericLimits.cuh"
@@ -12,7 +11,6 @@ using namespace at;
 __device__ void test(){
-
 // test half construction and implicit conversions in device
 assert(Half(3) == Half(3.0f));
 assert(static_cast<Half>(3.0f) == Half(3.0f));
@@ -24,7 +22,7 @@ __device__ void test(){
 __half c = a - Half(b);
 assert(static_cast<Half>(c) == Half(1.0));
- // asserting if the functions used on
+ // asserting if the functions used on
 // half types give almost equivalent results when using
 // functions on double.
 // The purpose of these asserts is to test the device side
@@ -61,17 +59,18 @@ __device__ void test(){
 assert(::abs(::abs(Half(-3.0)) - ::abs(-3.0f)) <= threshold);
 assert(::abs(::round(Half(2.3)) - ::round(2.3f)) <= threshold);
 assert(::abs(::pow(Half(2.0), Half(10.0)) - ::pow(2.0f, 10.0f)) <= threshold);
- assert(::abs(::atan2(Half(7.0), Half(0.0)) - ::atan2(7.0f, 0.0f)) <= threshold);
+ assert(
+ ::abs(::atan2(Half(7.0), Half(0.0)) - ::atan2(7.0f, 0.0f)) <= threshold);
 // note: can't use namespace on isnan and isinf in device code
- #ifdef _MSC_VER
- // Windows requires this explicit conversion. The reason is unclear
- // related issue with clang: https://reviews.llvm.org/D37906
- assert(::abs(::isnan((float)Half(0.0)) - ::isnan(0.0f)) <= threshold);
- assert(::abs(::isinf((float)Half(0.0)) - ::isinf(0.0f)) <= threshold);
- #else
- assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold);
- assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold);
- #endif
+#ifdef _MSC_VER
+ // Windows requires this explicit conversion. 
The reason is unclear + // related issue with clang: https://reviews.llvm.org/D37906 + assert(::abs(::isnan((float)Half(0.0)) - ::isnan(0.0f)) <= threshold); + assert(::abs(::isinf((float)Half(0.0)) - ::isinf(0.0f)) <= threshold); +#else + assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); + assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); +#endif } __global__ void kernel(){ @@ -79,12 +78,13 @@ __global__ void kernel(){ } void launch_function(){ - kernel<<<1,1>>>(); + kernel<<<1, 1>>>(); } -CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +// half common math functions tests in device +TEST(HalfCuda, HalfCuda) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + ASSERT_TRUE(isEQ); } - diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index b64c530b355914..128e1cf5f5147e 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +7,15 @@ using namespace at; -CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { +// optional in cuda files +TEST(OptionalTest, OptionalTestCUDA) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - CATCH_REQUIRE(!trivially_destructible.has_value()); - CATCH_REQUIRE(!non_trivially_destructible.has_value()); + ASSERT_FALSE(trivially_destructible.has_value()); + ASSERT_FALSE(non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - CATCH_REQUIRE(trivially_destructible.has_value()); - CATCH_REQUIRE(non_trivially_destructible.has_value()); + ASSERT_TRUE(trivially_destructible.has_value()); + ASSERT_TRUE(non_trivially_destructible.has_value()); } - diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu index a529f38d748a1b..32f5f410bb2eb5 100644 --- a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -10,9 +9,10 @@ using namespace at; -__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, - PackedTensorAccessor t1a, - PackedTensorAccessor t2a){ +__global__ void test_tensor_packed_accessor_kernel( + PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a) { for (int64_t i = 0; i < resa.size(0); i++) { float val = 0.0f; for (int64_t j = 0; j < t1a.size(1); j++) { @@ -22,7 +22,8 @@ __global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor(); auto stream = at::cuda::getCurrentCUDAStream(); - + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + ASSERT_TRUE(isEQ); auto expected = mv(t1, t2); - CATCH_REQUIRE(res.allclose(expected)); + ASSERT_TRUE(res.allclose(expected)); } diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index 7b14174d3baeb3..f5645a7978c11f 100644 --- a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN 
-#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +20,6 @@ void testCudaRNGMultithread() { } }; -CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { - CATCH_SECTION( "multithread" ) - testCudaRNGMultithread(); +TEST(Cuda_RNGTest, MultithreadRNGTest) { + testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 4391867d166772..54da9420ff60a1 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +8,7 @@ using namespace at; using namespace at::native; -CATCH_TEST_CASE( "cudnn", "[cuda]" ) { +TEST(CUDNNTest, CUDNNTestCUDA) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -17,9 +16,12 @@ CATCH_TEST_CASE( "cudnn", "[cuda]" ) { DropoutDescriptor desc1, desc2; desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - - CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); + bool isEQ; + isEQ = (desc1.desc()->dropout == desc2.desc()->dropout); + ASSERT_TRUE(isEQ); + isEQ = (desc1.desc()->nstates == desc2.desc()->nstates); + ASSERT_TRUE(isEQ); + isEQ = (desc1.desc()->states == desc2.desc()->states); + ASSERT_TRUE(isEQ); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index bf0cf93f7c4064..71a8d535d01e4e 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -10,18 +9,13 @@ #include "test_seed.h" using namespace at; - -CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { - +TEST(TestDlconvertor, TestDlconvertor) { manual_seed(123, at::kCPU); - CATCH_INFO( "convert ATen to DLTensor" ); - - Tensor a = rand({3,4}); + Tensor a = rand({3, 4}); DLManagedTensor* dlMTensor = toDLPack(a); - CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - CATCH_REQUIRE(a.equal(b)); + ASSERT_TRUE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 32177705a2f883..5aa062f125b21a 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include #include @@ -12,53 +11,53 @@ using namespace at; -CATCH_TEST_CASE( "half arithmetic", "[]" ) { +TEST(TestHalf, Arithmetic) { Half zero = 0; Half one = 1; - CATCH_REQUIRE(zero + one == one); - CATCH_REQUIRE(zero + zero == zero); - CATCH_REQUIRE(zero * one == zero); - CATCH_REQUIRE(one * one == one); - CATCH_REQUIRE(one / one == one); - CATCH_REQUIRE(one - one == zero); - CATCH_REQUIRE(one - zero == one); - CATCH_REQUIRE(zero - one == -one); - CATCH_REQUIRE(one + one == Half(2)); - CATCH_REQUIRE(one + one == 2); + ASSERT_EQ(zero + one, one); + ASSERT_EQ(zero + zero, zero); + ASSERT_EQ(zero * one, zero); + ASSERT_EQ(one * one, one); + ASSERT_EQ(one / one, one); + ASSERT_EQ(one - one, zero); + ASSERT_EQ(one - zero, one); + ASSERT_EQ(zero - one, -one); + ASSERT_EQ(one + one, Half(2)); + ASSERT_EQ(one + one, 2); } -CATCH_TEST_CASE( "half comparisons", "[]" ) { +TEST(TestHalf, 
Comparisons) {
 Half zero = 0;
 Half one = 1;
- CATCH_REQUIRE(zero < one);
- CATCH_REQUIRE(zero < 1);
- CATCH_REQUIRE(1 > zero);
- CATCH_REQUIRE(0 >= zero);
- CATCH_REQUIRE(0 != one);
- CATCH_REQUIRE(zero == 0);
- CATCH_REQUIRE(zero == zero);
- CATCH_REQUIRE(zero == -zero);
+ ASSERT_LT(zero, one);
+ ASSERT_LT(zero, 1);
+ ASSERT_GT(1, zero);
+ ASSERT_GE(0, zero);
+ ASSERT_NE(0, one);
+ ASSERT_EQ(zero, 0);
+ ASSERT_EQ(zero, zero);
+ ASSERT_EQ(zero, -zero);
}
-CATCH_TEST_CASE( "half cast", "[]" ) {
+TEST(TestHalf, Cast) {
 Half value = 1.5f;
- CATCH_REQUIRE((int)value == 1);
- CATCH_REQUIRE((short)value == 1);
- CATCH_REQUIRE((long long)value == 1LL);
- CATCH_REQUIRE((float)value == 1.5f);
- CATCH_REQUIRE((double)value == 1.5);
- CATCH_REQUIRE((bool)value == true);
- CATCH_REQUIRE((bool)Half(0.0f) == false);
+ ASSERT_EQ((int)value, 1);
+ ASSERT_EQ((short)value, 1);
+ ASSERT_EQ((long long)value, 1LL);
+ ASSERT_EQ((float)value, 1.5f);
+ ASSERT_EQ((double)value, 1.5);
+ ASSERT_EQ((bool)value, true);
+ ASSERT_EQ((bool)Half(0.0f), false);
}
-CATCH_TEST_CASE( "half construction", "[]" ) {
- CATCH_REQUIRE(Half((short)3) == Half(3.0f));
- CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f));
- CATCH_REQUIRE(Half(3) == Half(3.0f));
- CATCH_REQUIRE(Half(3U) == Half(3.0f));
- CATCH_REQUIRE(Half(3LL) == Half(3.0f));
- CATCH_REQUIRE(Half(3ULL) == Half(3.0f));
- CATCH_REQUIRE(Half(3.5) == Half(3.5f));
+TEST(TestHalf, Construction) {
+ ASSERT_EQ(Half((short)3), Half(3.0f));
+ ASSERT_EQ(Half((unsigned short)3), Half(3.0f));
+ ASSERT_EQ(Half(3), Half(3.0f));
+ ASSERT_EQ(Half(3U), Half(3.0f));
+ ASSERT_EQ(Half(3LL), Half(3.0f));
+ ASSERT_EQ(Half(3ULL), Half(3.0f));
+ ASSERT_EQ(Half(3.5), Half(3.5f));
}
 static std::string to_string(const Half& h) {
@@ -67,31 +66,31 @@ static std::string to_string(const Half& h) {
 return ss.str();
}
-CATCH_TEST_CASE( "half to string", "[]" ) {
- CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5");
- CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100");
+TEST(TestHalf, Half2String) {
+ ASSERT_EQ(to_string(Half(3.5f)), "3.5");
+ ASSERT_EQ(to_string(Half(-100.0f)), "-100");
}
-CATCH_TEST_CASE( "half numeric limits", "[]" ) {
+TEST(TestHalf, HalfNumericLimits) {
 using limits = std::numeric_limits<Half>;
- CATCH_REQUIRE(limits::lowest() == -65504.0f);
- CATCH_REQUIRE(limits::max() == 65504.0f);
- CATCH_REQUIRE(limits::min() > 0);
- CATCH_REQUIRE(limits::min() < 1);
- CATCH_REQUIRE(limits::denorm_min() > 0);
- CATCH_REQUIRE(limits::denorm_min() / 2 == 0);
- CATCH_REQUIRE(limits::infinity() == std::numeric_limits<float>::infinity());
- CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN());
- CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN());
+ ASSERT_EQ(limits::lowest(), -65504.0f);
+ ASSERT_EQ(limits::max(), 65504.0f);
+ ASSERT_GT(limits::min(), 0);
+ ASSERT_LT(limits::min(), 1);
+ ASSERT_GT(limits::denorm_min(), 0);
+ ASSERT_EQ(limits::denorm_min() / 2, 0);
+ ASSERT_EQ(limits::infinity(), std::numeric_limits<float>::infinity());
+ ASSERT_NE(limits::quiet_NaN(), limits::quiet_NaN());
+ ASSERT_NE(limits::signaling_NaN(), limits::signaling_NaN());
}
 // Check the declared type of members of numeric_limits<Half> matches
 // the declared type of that member on numeric_limits<float>
-#define ASSERT_SAME_TYPE(name) \
- static_assert( \
- std::is_same< \
- decltype(std::numeric_limits<Half>::name), \
+#define ASSERT_SAME_TYPE(name) \
+ static_assert( \
+ std::is_same< \
+ decltype(std::numeric_limits<Half>::name), \
 decltype(std::numeric_limits<float>::name)>::value, \
 "decltype(" #name ") differs")
@@ -119,7 +118,7 @@ 
ASSERT_SAME_TYPE(max_exponent10);
 ASSERT_SAME_TYPE(traps);
 ASSERT_SAME_TYPE(tinyness_before);
-CATCH_TEST_CASE( "half common math functions test", "[]" ) {
+TEST(TestHalf, CommonMath) {
 float threshold = 0.00001;
 assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= threshold);
 assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold);
@@ -147,14 +146,22 @@ CATCH_TEST_CASE( "half common math functions test", "[]" ) {
 assert(std::abs(std::erfc(Half(10.0)) - std::erfc(10.0f)) <= threshold);
 assert(std::abs(std::abs(Half(-3.0)) - std::abs(-3.0f)) <= threshold);
 assert(std::abs(std::round(Half(2.3)) - std::round(2.3f)) <= threshold);
- assert(std::abs(std::pow(Half(2.0), Half(10.0)) - std::pow(2.0f, 10.0f)) <= threshold);
- assert(std::abs(std::atan2(Half(7.0), Half(0.0)) - std::atan2(7.0f, 0.0f)) <= threshold);
- #ifdef __APPLE__
- // @TODO: can macos do implicit conversion of Half?
- assert(std::abs(std::isnan(static_cast<float>(Half(0.0))) - std::isnan(0.0f)) <= threshold);
- assert(std::abs(std::isinf(static_cast<float>(Half(0.0))) - std::isinf(0.0f)) <= threshold);
- #else
- assert(std::abs(std::isnan(Half(0.0)) - std::isnan(0.0f)) <= threshold);
- assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold);
- #endif
-} \ No newline at end of file
+ assert(
+ std::abs(std::pow(Half(2.0), Half(10.0)) - std::pow(2.0f, 10.0f)) <=
+ threshold);
+ assert(
+ std::abs(std::atan2(Half(7.0), Half(0.0)) - std::atan2(7.0f, 0.0f)) <=
+ threshold);
+#ifdef __APPLE__
+ // @TODO: can macos do implicit conversion of Half?
+ assert(
+ std::abs(std::isnan(static_cast<float>(Half(0.0))) - std::isnan(0.0f)) <=
+ threshold);
+ assert(
+ std::abs(std::isinf(static_cast<float>(Half(0.0))) - std::isinf(0.0f)) <=
+ threshold);
+#else
+ assert(std::abs(std::isnan(Half(0.0)) - std::isnan(0.0f)) <= threshold);
+ assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold);
+#endif
+}
diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu
index d09a423d7ca72d..21169e9ee30625 100644
--- a/aten/src/ATen/test/integer_divider_test.cu
+++ b/aten/src/ATen/test/integer_divider_test.cu
@@ -1,5 +1,4 @@
-#define CATCH_CONFIG_MAIN
-#include "catch_utils.hpp"
+#include "gtest/gtest.h"
 // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or
 // (b-1), so it takes a few minutes to run. 
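 // (Editor's note, a sketch of the invariant this file checks, quoting the
 // kernel below: for every sampled (dividend, divisor) pair,
 //   DivMod<Value> qr = divider.divmod(dividend);
 //   assert(qr.div == dividend / divisor && qr.mod == dividend % divisor);
 // i.e. the precomputed divider must agree with plain / and % everywhere.)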
@@ -20,24 +19,25 @@ struct TestCase {
 int steps;
 TestCase(Value dividend, int divisor_idx, int steps)
- : dividend(dividend), divisor_idx(divisor_idx), steps(steps) { }
+ : dividend(dividend), divisor_idx(divisor_idx), steps(steps) {}
};
-template<typename Value>
-__global__ void testIntDivider(const IntDivider<Value> *dividers,
- const TestCase<Value> *testCases,
- int numCases)
-{
+template <typename Value>
+__global__ void testIntDivider(
+ const IntDivider<Value>* dividers,
+ const TestCase<Value>* testCases,
+ int numCases) {
 int index = blockIdx.x * blockDim.x + threadIdx.x;
 int stride = blockDim.x * gridDim.x;
 for (int i = index; i < numCases; i += stride) {
- const TestCase<Value> &tc = testCases[i];
+ const TestCase<Value>& tc = testCases[i];
 Value dividend = tc.dividend;
- const IntDivider<Value> &divider = dividers[tc.divisor_idx];
+ const IntDivider<Value>& divider = dividers[tc.divisor_idx];
 Value divisor = divider.divisor;
 for (int j = 0; j < tc.steps; j++) {
- if (sizeof(Value) == 4 && dividend > INT32_MAX) return;
+ if (sizeof(Value) == 4 && dividend > INT32_MAX)
+ return;
 DivMod<Value> qr = divider.divmod(dividend);
 assert(qr.div == dividend / divisor && qr.mod == dividend % divisor);
@@ -62,18 +62,22 @@ class IntDividerTester {
 cudaError_t err;
 err = cudaMalloc(&dividersBuf_, NUM_CASES * sizeof(IntDivider<Value>));
- CATCH_REQUIRE(err == cudaSuccess);
+ bool isEQ = err == cudaSuccess;
+ EXPECT_TRUE(isEQ);
 err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase<Value>));
- CATCH_REQUIRE(err == cudaSuccess);
+ isEQ = err == cudaSuccess;
+ EXPECT_TRUE(isEQ);
 }
 ~IntDividerTester() {
 cudaError_t err;
 err = cudaFree(dividersBuf_);
- CATCH_REQUIRE(err == cudaSuccess);
+ bool isEQ = err == cudaSuccess;
+ EXPECT_TRUE(isEQ);
 err = cudaFree(testCasesBuf_);
- CATCH_REQUIRE(err == cudaSuccess);
+ isEQ = err == cudaSuccess;
+ EXPECT_TRUE(isEQ);
 }
 void addTestCase(Value dividend, Value divisor, int steps) {
@@ -85,29 +89,39 @@ class IntDividerTester {
 testCases_.emplace_back(dividend, dividers_.size() - 1, steps);
 // Launch the test kernel if the buffer is full.
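 // (Editor's note: NUM_CASES also sizes the two device buffers allocated in
 // the constructor above, so one kernel launch covers exactly one full batch.)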
- if (testCases_.size() == NUM_CASES) flush();
+ if (testCases_.size() == NUM_CASES)
+ flush();
 }
 void flush() {
 cudaError_t err;
-
- if (testCases_.empty()) return;
- CATCH_REQUIRE(!dividers_.empty());
-
- CATCH_REQUIRE(dividers_.size() <= NUM_CASES);
- CATCH_REQUIRE(testCases_.size() <= NUM_CASES);
- err = cudaMemcpy(dividersBuf_, dividers_.data(),
- dividers_.size() * sizeof(IntDivider<Value>),
- cudaMemcpyHostToDevice);
- CATCH_REQUIRE(err == cudaSuccess);
- err = cudaMemcpy(testCasesBuf_, testCases_.data(),
- testCases_.size() * sizeof(TestCase<Value>),
- cudaMemcpyHostToDevice);
- CATCH_REQUIRE(err == cudaSuccess);
+ bool isTrue;
+ if (testCases_.empty())
+ return;
+
+ ASSERT_FALSE(dividers_.empty());
+
+ isTrue = dividers_.size() <= NUM_CASES;
+ ASSERT_TRUE(isTrue);
+ isTrue = testCases_.size() <= NUM_CASES;
+ ASSERT_TRUE(isTrue);
+ err = cudaMemcpy(
+ dividersBuf_,
+ dividers_.data(),
+ dividers_.size() * sizeof(IntDivider<Value>),
+ cudaMemcpyHostToDevice);
+ isTrue = err == cudaSuccess;
+ ASSERT_TRUE(isTrue);
+ err = cudaMemcpy(
+ testCasesBuf_,
+ testCases_.data(),
+ testCases_.size() * sizeof(TestCase<Value>),
+ cudaMemcpyHostToDevice);
+ isTrue = err == cudaSuccess;
+ ASSERT_TRUE(isTrue);
 int numCases = testCases_.size();
- testIntDivider<<<512, 512>>>(
- dividersBuf_, testCasesBuf_, numCases);
+ testIntDivider<<<512, 512>>>(dividersBuf_, testCasesBuf_, numCases);
 dividers_.clear();
 testCases_.clear();
@@ -117,8 +131,8 @@ class IntDividerTester {
 vector<IntDivider<Value>> dividers_;
 vector<TestCase<Value>> testCases_;
- IntDivider<Value> *dividersBuf_;
- TestCase<Value> *testCasesBuf_;
+ IntDivider<Value>* dividersBuf_;
+ TestCase<Value>* testCasesBuf_;
};
 static void testUint32Divider()
@@ -128,15 +142,18 @@ static void testUint32Divider()
 IntDividerTester<uint32_t> tester;
 for (uint64_t divisor = 1; divisor <= INT32_MAX; divisor++) {
- if (divisor < 1000000 && divisor % 10000 == 0) fprintf(stderr, ".");
- if (divisor % 10000000 == 0) fprintf(stderr, "-");
+ if (divisor < 1000000 && divisor % 10000 == 0)
+ fprintf(stderr, ".");
+ if (divisor % 10000000 == 0)
+ fprintf(stderr, "-");
 // In order to save time, we only test when the remainder is zero or
 // (divisor - 1). 
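 // (Editor's note: these are presumably the boundary remainders at which a
 // rounding error in the precomputed divider would first become visible.)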
uint64_t dividend = 0; while (dividend <= INT32_MAX) { uint64_t steps = (INT32_MAX - dividend) / divisor + 1; - if (steps > MAX_STEPS) steps = MAX_STEPS; + if (steps > MAX_STEPS) + steps = MAX_STEPS; tester.addTestCase(dividend, divisor, steps); tester.addTestCase(dividend + divisor - 1, divisor, steps); @@ -180,11 +197,11 @@ static void testUint64Divider() tester.flush(); } -CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { - +TEST(TestCUDAIntegerDivider, IntegerDivider) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 4c57b7d8ee1d96..6721c69b0e0f36 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,192 +1,222 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -using Catch::Matchers::StartsWith; +#define ASSERT_EQUAL(t1, t2) ASSERT_TRUE(t1.equal(t2)); -#define REQUIRE_EQUAL(t1, t2) \ - CATCH_REQUIRE(t1.equal(t2)); +#define ASSERT_ALLCLOSE(t1, t2) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2)); -#define REQUIRE_ALLCLOSE(t1, t2) \ - CATCH_REQUIRE(t1.is_same_size(t2)); \ - CATCH_REQUIRE(t1.allclose(t2)); - -#define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - CATCH_REQUIRE(t1.is_same_size(t2)); \ - CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - CATCH_REQUIRE(t1.size() == t2.size()); + ASSERT_EQ(t1.size(), t2.size()); for (size_t i = 0; i < t1.size(); ++i) { - REQUIRE_EQUAL(t1[ i ], t2[ i ]); + ASSERT_EQUAL(t1[i], t2[i]); } } -void test(Type & T, Type & AccT) { - auto t = randn({3, 3}, T); - - CATCH_SECTION( "split: test method, type, namespace give same result" ) { - auto splitMethod = t.split(1, 0); - auto splitType = T.split(t, 1, 0); - auto splitNs = at::split(t, 1, 0); - requireEqualTensorList(splitMethod, splitType); - requireEqualTensorList(splitMethod, splitNs); +// split: test method, type, namespace give same result +void TestSplit(Type& T, Tensor& t) { + auto splitMethod = t.split(1, 0); + auto splitType = T.split(t, 1, 0); + auto splitNs = at::split(t, 1, 0); + requireEqualTensorList(splitMethod, splitType); + requireEqualTensorList(splitMethod, splitNs); - // test rebuilding with cat - REQUIRE_EQUAL(at::cat(splitMethod, 0), t); - } + // test rebuilding with cat + ASSERT_EQUAL(at::cat(splitMethod, 0), t); +} - CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { - // test method, type, namespace give same result - auto chunkMethod = t.chunk(3, 0); - auto chunkType = T.chunk(t, 3, 0); - auto chunkNs = at::chunk(t, 3, 0); - requireEqualTensorList(chunkMethod, chunkType); - requireEqualTensorList(chunkMethod, chunkNs); +// chunk: test method, type, namespace give same result +void TestChunk(Type& T, Tensor& t) { + // test method, type, namespace give same result + auto chunkMethod = t.chunk(3, 0); + auto chunkType = T.chunk(t, 3, 0); + auto chunkNs = at::chunk(t, 3, 0); + requireEqualTensorList(chunkMethod, chunkType); + requireEqualTensorList(chunkMethod, chunkNs); + + // test rebuilding with cat + ASSERT_EQUAL(at::cat(chunkMethod, 0), t); +} - // test rebuilding with 
cat - REQUIRE_EQUAL(at::cat(chunkMethod, 0), t); +void TestStack(Type& T, Tensor& t) { + auto x = rand({2, 3, 4}); + auto y = rand({2, 3, 4}); + auto z = rand({2, 3, 4}); + for (int64_t dim = 0; dim < 4; ++dim) { + auto res = at::stack({x, y, z}, dim); + auto res_neg = at::stack({x, y, z}, dim - 4); + std::vector expected_size; + expected_size.insert( + expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); + expected_size.insert(expected_size.end(), 3); + expected_size.insert( + expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); + + ASSERT_EQUAL(res, res_neg); + ASSERT_TRUE(res.sizes().equals(expected_size)); + ASSERT_EQUAL(res.select(dim, 0), x); + ASSERT_EQUAL(res.select(dim, 1), y); + ASSERT_EQUAL(res.select(dim, 2), z); } +} - // stack - CATCH_SECTION( "stack" ) { - auto x = rand({2, 3, 4}); - auto y = rand({2, 3, 4}); - auto z = rand({2, 3, 4}); - for (int64_t dim = 0; dim < 4; ++dim) { - auto res = at::stack({x, y, z}, dim); - auto res_neg = at::stack({x, y, z}, dim - 4); - std::vector expected_size; - expected_size.insert(expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); - expected_size.insert(expected_size.end(), 3); - expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); - - REQUIRE_EQUAL(res, res_neg); - CATCH_REQUIRE(res.sizes().equals(expected_size)); - REQUIRE_EQUAL(res.select(dim, 0), x); - REQUIRE_EQUAL(res.select(dim, 1), y); - REQUIRE_EQUAL(res.select(dim, 2), z); - } - } +// size / stride +void TestSize(Type& T, Tensor& t) { + auto scalar = randn({}, T); + // Throw StartsWith("dimension specified as 0 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.size(0)); + // Throw StartsWith("dimension specified as -1 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.size(-1)); + // Throw StartsWith("dimension specified as 0 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.stride(0)); + // Throw StartsWith("dimension specified as -1 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.stride(-1)); + + auto empty = randn({0}, T); + ASSERT_EQ(empty.size(0), 0); + ASSERT_EQ(empty.size(-1), 0); + ASSERT_EQ(empty.stride(0), 1); + ASSERT_EQ(empty.stride(-1), 1); +} - CATCH_SECTION( "size / stride" ) { - auto scalar = randn({}, T); - CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - - auto empty = randn({0}, T); - CATCH_REQUIRE(empty.size(0) == 0); - CATCH_REQUIRE(empty.size(-1) == 0); - CATCH_REQUIRE(empty.stride(0) == 1); - CATCH_REQUIRE(empty.stride(-1) == 1); - } +void TestMatmul(Type& T, Tensor& t, Type& AccT) { + auto scalar = randn({}, T); + auto d1 = randn({3}, T); + auto d2 = randn({2, 3}, T); + + // 0-d + // Throw StartsWith("both arguments to matmul need to be at least 1D") + ASSERT_ANY_THROW(scalar.matmul(d2)); + // Throw StartsWith("both arguments to matmul need to be at least 1D") + ASSERT_ANY_THROW(d2.matmul(scalar)); + + // 1-d + ASSERT_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); + ASSERT_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); + auto d1o = randn({2}, T); + ASSERT_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); + + // 2-d + auto d2o = randn({3, 5}, T); + 
ASSERT_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); + + // > 2-d, 1-d + auto d3 = randn({5, 2, 3}, T); + ASSERT_ALLCLOSE( + d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); + ASSERT_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); + + auto d5 = randn({3, 2, 4, 2, 3}, T); + ASSERT_ALLCLOSE( + d5.matmul(d1), + d5.view({24, 2, 3}) + .bmm(d1.view({1, 3, 1}).expand({24, 3, 1})) + .view({3, 2, 4, 2})); + ASSERT_ALLCLOSE( + d1o.matmul(d5), + d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); + + // > 2-d, 2-d + // we use a "folding" algorithm in this case of matmul, so the direct + // comparison to bmm doesn't work; instead, compare to the higher precision + // computation (technically, we should always do this). Tolerances are + // selected empirically. + double atol = 1e-04; + double rtol = 1e-06; + d2 = randn({3, 4}, T); + d2o = randn({4, 2}, T); + auto result = d5.matmul(d2).toType(AccT); + + auto d5Acc = d5.toType(AccT); + auto d2Acc = d2.toType(AccT); + auto acc_result = d5Acc.view({24, 2, 3}) + .bmm(d2Acc.expand({24, 3, 4})) + .view({3, 2, 4, 2, 4}); + ASSERT_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); + ASSERT_ALLCLOSE( + d2o.matmul(d5), + d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); + + // > 2-d, > 2-d + auto d5o = randn({2, 1, 2, 4, 3, 2}, T); + auto d5_bmm_view = + d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); + auto d5o_bmm_view = + d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); + ASSERT_ALLCLOSE( + d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); + + // non-expandable case + auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); + // Throw Contains("must match the size") + ASSERT_ANY_THROW(d5.matmul(d5wrong)); +} - // matmul - CATCH_SECTION( "matmul" ) { - auto scalar = randn({}, T); - auto d1 = randn({3}, T); - auto d2 = randn({2, 3}, T); - - // 0-d - CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - - // 1-d - REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); - REQUIRE_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); - auto d1o = randn({2}, T); - REQUIRE_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); - - // 2-d - auto d2o = randn({3, 5}, T); - REQUIRE_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); - - // > 2-d, 1-d - auto d3 = randn({5, 2, 3}, T); - REQUIRE_ALLCLOSE(d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); - REQUIRE_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); - - auto d5 = randn({3, 2, 4, 2, 3}, T); - REQUIRE_ALLCLOSE(d5.matmul(d1), d5.view({24, 2, 3}).bmm(d1.view({1, 3, 1}).expand({24, 3, 1})).view({3, 2, 4, 2})); - REQUIRE_ALLCLOSE(d1o.matmul(d5), d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); - - // > 2-d, 2-d - // we use a "folding" algorithm in this case of matmul, so the direct comparison to bmm doesn't work; - // instead, compare to the higher precision computation (technically, we should always do this). - // Tolerances are selected empirically. 
- double atol = 1e-04; - double rtol = 1e-06; - d2 = randn({3, 4}, T); - d2o = randn({4, 2}, T); - auto result = d5.matmul(d2).toType(AccT); - - auto d5Acc = d5.toType(AccT); - auto d2Acc = d2.toType(AccT); - auto acc_result = d5Acc.view({24, 2, 3}).bmm(d2Acc.expand({24, 3, 4})).view({3, 2, 4, 2, 4}); - REQUIRE_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); - REQUIRE_ALLCLOSE(d2o.matmul(d5), d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); - - // > 2-d, > 2-d - auto d5o = randn({2, 1, 2, 4, 3, 2}, T); - auto d5_bmm_view = d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); - auto d5o_bmm_view = d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); - REQUIRE_ALLCLOSE(d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); - - // non-expandable case - auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); - } +void TestStandardGammaGrad(Type& T, Tensor& t) { + // check empty + auto empty = ones({0}, T); + ASSERT_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); + + // check scalar equals one element + auto one_scalar = ones({}, T).mul(5); + auto one_with_dim = ones({1}, T).mul(5); + ASSERT_ALLCLOSE( + at::_standard_gamma_grad(one_scalar, one_scalar), + at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); + + // check mixing types + auto t1 = randn({3, 4}, T); + auto t2 = randn({3, 4}, T).toType(kDouble); + // Throw StartsWith("expected scalar type") + ASSERT_ANY_THROW(at::_standard_gamma_grad(t1, t2)); +} - // _standard_gamma_grad - CATCH_SECTION( "_standard_gamma_grad" ) { - // check empty - auto empty = ones({0}, T); - REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); - - // check scalar equals one element - auto one_scalar = ones({}, T).mul(5); - auto one_with_dim = ones({1}, T).mul(5); - REQUIRE_ALLCLOSE(at::_standard_gamma_grad(one_scalar, one_scalar), - at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); - - // check mixing types - auto t1 = randn({3, 4}, T); - auto t2 = randn({3, 4}, T).toType(kDouble); - CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); - } +void TestWhere(Type& T, Tensor& t) { + // empty + auto empty = ones({0}, T); + auto& bT = T.toScalarType(ScalarType::Byte); + auto empty_byte = ones({0}, bT); + ASSERT_EQUAL(empty, at::where(empty_byte, empty, empty)); + + // check scalar equals one element + auto x_scalar = ones({}, T).mul(5); + auto y_scalar = ones({}, T).mul(7); + auto cond_scalar = zeros({}, bT); + auto x_1d = x_scalar.unsqueeze(0); + auto y_1d = y_scalar.unsqueeze(0); + auto cond_1d = cond_scalar.unsqueeze(0); + ASSERT_ALLCLOSE( + at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), + at::where(cond_1d, x_1d, y_1d)); +} - CATCH_SECTION( "where" ) { - // empty - auto empty = ones({0}, T); - auto &bT = T.toScalarType(ScalarType::Byte); - auto empty_byte = ones({0}, bT); - REQUIRE_EQUAL(empty, at::where(empty_byte, empty, empty)); - - // check scalar equals one element - auto x_scalar = ones({}, T).mul(5); - auto y_scalar = ones({}, T).mul(7); - auto cond_scalar = zeros({}, bT); - auto x_1d = x_scalar.unsqueeze(0); - auto y_1d = y_scalar.unsqueeze(0); - auto cond_1d = cond_scalar.unsqueeze(0); - REQUIRE_ALLCLOSE(at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), - at::where(cond_1d, x_1d, y_1d)); - } +void test(Type& T, Type& AccT) { + auto t = randn({3, 3}, T); + TestSplit(T, t); + TestChunk(T, t); + TestStack(T, t); + 
TestSize(T, t); + TestMatmul(T, t, AccT); + TestStandardGammaGrad(T, t); + TestWhere(T, t); } -CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { +TEST(TestNative, NativeTestCPU) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { +TEST(TestNative, NativeTestGPU) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index a89ca81da017f7..5bb3aafaff9247 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,31 +8,33 @@ using namespace at; -#define TRY_CATCH_ELSE(fn, catc, els) \ - { \ - /* avoid mistakenly passing if els code throws exception*/ \ - bool _passed = false; \ - try { \ - fn; \ - _passed = true; \ - els; \ - } catch (std::exception &e) { \ - CATCH_REQUIRE(!_passed); \ - catc; \ - } \ +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::exception & e) { \ + ASSERT_FALSE(_passed); \ + catc; \ + } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - CATCH_REQUIRE(lhs.dim() == rhs.dim()); - CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); + ASSERT_EQ(lhs.dim(), rhs.dim()); + ASSERT_TRUE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { - if(from_size.size() > to_size.size()) { + if (from_size.size() > to_size.size()) { return false; } - for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); ++from_dim_it) { - for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); ++to_dim_it) { + for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); + ++from_dim_it) { + for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); + ++to_dim_it) { if (*from_dim_it != 1 && *from_dim_it != *to_dim_it) { return false; } @@ -43,21 +44,22 @@ bool should_expand(const IntList &from_size, const IntList &to_size) { } void test(Type &T) { - std::vector > sizes = { {}, {0}, {1}, {1, 1}, {2}}; + std::vector> sizes = {{}, {0}, {1}, {1, 1}, {2}}; // single-tensor/size tests for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. 
auto t = ones(*s, T); - CATCH_REQUIRE((size_t)t.dim() == s->size()); - CATCH_REQUIRE((size_t)t.ndimension() == s->size()); - CATCH_REQUIRE(t.sizes().equals(*s)); - CATCH_REQUIRE(t.strides().size() == s->size()); - auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - CATCH_REQUIRE(t.numel() == numel); + ASSERT_EQ((size_t)t.dim(), s->size()); + ASSERT_EQ((size_t)t.ndimension(), s->size()); + ASSERT_TRUE(t.sizes().equals(*s)); + ASSERT_EQ(t.strides().size(), s->size()); + auto numel = + std::accumulate(s->begin(), s->end(), 1, std::multiplies()); + ASSERT_EQ(t.numel(), numel); // verify we can output std::stringstream ss; - CATCH_REQUIRE_NOTHROW(ss << t << std::endl); + ASSERT_NO_THROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +67,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + ASSERT_EQ(t.unsqueeze(0).dim(), t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - CATCH_REQUIRE(r.dim() == t.dim() + 1); + ASSERT_EQ(r.dim(), t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t.squeeze(0).dim(), std::max(t.dim() - 1, 0)); } else { - // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; - // in NumPy this is an error. - CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); + // In PyTorch, it is a no-op to try to squeeze a dimension that has size + // != 1; in NumPy this is an error. + ASSERT_EQ(t.squeeze(0).dim(), t.dim()); } // squeeze (with no dimension argument) @@ -98,12 +100,12 @@ void test(Type &T) { { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); - if (t2.dim() == 0 || t2.sizes()[0] == 1) { - CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + if (t2.dim() == 0 || t2.sizes()[0] == 1) { + ASSERT_EQ(t2.squeeze_(0).dim(), std::max(t.dim() - 1, 0)); } else { - // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; - // in NumPy this is an error. - CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); + // In PyTorch, it is a no-op to try to squeeze a dimension that has size + // != 1; in NumPy this is an error. 
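+ // (Editor's note, illustrative: squeeze_(0) leaves a {2} tensor at shape
+ // {2}, whereas in the branch above a {1, 1} tensor becomes shape {1}.)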
+ ASSERT_EQ(t2.squeeze_(0).dim(), t.dim()); } } @@ -122,154 +124,156 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t.sum(0).dim(), std::max(t.dim() - 1, 0)); } else { - CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); + ASSERT_TRUE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(std::get<0>(ret).dim(), std::max(t.dim() - 1, 0)); + ASSERT_EQ(std::get<1>(ret).dim(), std::max(t.dim() - 1, 0)); } else { - _CATCH_REQUIRE_THROWS(t.min(0)); + ASSERT_ANY_THROW(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t[0].dim(), std::max(t.dim() - 1, 0)); } else { - _CATCH_REQUIRE_THROWS(t[0]); + ASSERT_ANY_THROW(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) - TRY_CATCH_ELSE(t.fill_(t.sum(0)), - CATCH_REQUIRE(t.dim() > 1), - CATCH_REQUIRE(t.dim() <= 1)); + TRY_CATCH_ELSE( + t.fill_(t.sum(0)), ASSERT_GT(t.dim(), 1), ASSERT_LE(t.dim(), 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { for (auto rhs_it = sizes.begin(); rhs_it != sizes.end(); ++rhs_it) { // is_same_size should only match if they are the same shape { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - if(*lhs_it != *rhs_it) { - CATCH_REQUIRE(!lhs.is_same_size(rhs)); - CATCH_REQUIRE(!rhs.is_same_size(lhs)); - } - } - // forced size functions (resize_, resize_as, set_) - { - // resize_ - { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.resize_(*rhs_it); - require_equal_size_dim(lhs, rhs); - } - // resize_as_ - { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.resize_as_(rhs); - require_equal_size_dim(lhs, rhs); - } - // set_ - { - { - // with tensor - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.set_(rhs); - require_equal_size_dim(lhs, rhs); - } - { - // with storage - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - auto storage = T.storage(rhs.numel(), false); - lhs.set_(storage); - // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - CATCH_REQUIRE(lhs.dim() != 0); - } - { - // with storage, offset, sizes, strides - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - auto storage = T.storage(rhs.numel(), false); - lhs.set_(storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); - require_equal_size_dim(lhs, rhs); - } + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + if (*lhs_it != *rhs_it) { + ASSERT_FALSE(lhs.is_same_size(rhs)); + ASSERT_FALSE(rhs.is_same_size(lhs)); } } - - // view + // forced size functions (resize_, resize_as, set_) + {// resize_ + {auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_(*rhs_it); + require_equal_size_dim(lhs, rhs); + } + // resize_as_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_as_(rhs); + require_equal_size_dim(lhs, rhs); + } + // set_ + { { + // with tensor auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); - auto rhs_size = *rhs_it; - TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - CATCH_REQUIRE(lhs.numel() != rhs.numel()), - CATCH_REQUIRE(lhs.numel() == rhs.numel()); 
require_equal_size_dim(result, rhs);); + lhs.set_(rhs); + require_equal_size_dim(lhs, rhs); } - - // take { + // with storage auto lhs = ones(*lhs_it, T); - auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); - TRY_CATCH_ELSE(auto result = lhs.take(rhs), - CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), - require_equal_size_dim(result, rhs)); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel(), false); + lhs.set_(storage); + // should not be dim 0 because an empty storage is dim 1; all other + // storages aren't scalars + ASSERT_NE(lhs.dim(), 0); } - - - // ger { + // with storage, offset, sizes, strides auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); - TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - CATCH_REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), - [&]() { - int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); - int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); - require_equal_size_dim(result, at::empty({dim0, dim1}, result.options())); - }();); + auto storage = T.storage(rhs.numel(), false); + lhs.set_(storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); + require_equal_size_dim(lhs, rhs); } + } + } - // expand - { - auto lhs = ones(*lhs_it, T); - auto lhs_size = *lhs_it; - auto rhs = ones(*rhs_it, T); - auto rhs_size = *rhs_it; - bool should_pass = should_expand(lhs_size, rhs_size); - TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - CATCH_REQUIRE(!should_pass), - CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + // view + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), + ASSERT_NE(lhs.numel(), rhs.numel()), + ASSERT_EQ(lhs.numel(), rhs.numel()); + require_equal_size_dim(result, rhs);); + } - // in-place functions (would be good if we can also do a non-broadcasting one, b/c - // broadcasting functions will always end up operating on tensors of same size; - // is there an example of this outside of assign_ ?) - { - bool should_pass_inplace = should_expand(rhs_size, lhs_size); - TRY_CATCH_ELSE(lhs.add_(rhs), - CATCH_REQUIRE(!should_pass_inplace), - CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); - } - } + // take + { + auto lhs = ones(*lhs_it, T); + auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); + TRY_CATCH_ELSE(auto result = lhs.take(rhs), ASSERT_EQ(lhs.numel(), 0); + ASSERT_NE(rhs.numel(), 0), + require_equal_size_dim(result, rhs)); + } + + // ger + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + TRY_CATCH_ELSE(auto result = lhs.ger(rhs), + ASSERT_TRUE( + (lhs.numel() == 0 || rhs.numel() == 0 || + lhs.dim() != 1 || rhs.dim() != 1)), + [&]() { + int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); + int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); + require_equal_size_dim( + result, at::empty({dim0, dim1}, result.options())); + }();); + } + + // expand + { + auto lhs = ones(*lhs_it, T); + auto lhs_size = *lhs_it; + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + bool should_pass = should_expand(lhs_size, rhs_size); + TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), + ASSERT_FALSE(should_pass), + ASSERT_TRUE(should_pass); + require_equal_size_dim(result, rhs);); + + // in-place functions (would be good if we can also do a non-broadcasting + // one, b/c broadcasting functions will always end up operating on tensors + // of same size; is there an example of this outside of assign_ ?) 
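+ // (Editor's note: lhs.add_(rhs) can only broadcast rhs toward lhs's sizes,
+ // which is why the check below is should_expand(rhs_size, lhs_size) and not
+ // the reverse.)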
+ { + bool should_pass_inplace = should_expand(rhs_size, lhs_size); + TRY_CATCH_ELSE(lhs.add_(rhs), + ASSERT_FALSE(should_pass_inplace), + ASSERT_TRUE(should_pass_inplace); + require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } +} +} -CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +TEST(TestScalarTensor, TestScalarTensorCPU) { manual_seed(123, at::kCPU); - test(CPU(kFloat)); } -CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +TEST(TestScalarTensor, TestScalarTensorCUDA) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 10ffa9afc326ff..b188146f213f56 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include // define constants like M_PI and C keywords for MSVC @@ -33,26 +32,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); + ASSERT_EQ(s1.toFloat(), static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - CATCH_REQUIRE(s1.toFloat() == 100000.0); - CATCH_REQUIRE(s1.toInt() == 100000); + ASSERT_EQ(s1.toFloat(), 100000.0); + ASSERT_EQ(s1.toInt(), 100000); - CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + ASSERT_THROW(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - CATCH_REQUIRE(std::isnan(s1.toFloat())); - CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + ASSERT_TRUE(std::isnan(s1.toFloat())); + ASSERT_THROW(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - CATCH_REQUIRE(std::isinf(s1.toFloat())); - CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + ASSERT_TRUE(std::isinf(s1.toFloat())); + ASSERT_THROW(s1.toInt(), std::domain_error); } -CATCH_TEST_CASE( "scalar test", "[]" ) { - +TEST(TestScalar, TestScalar) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -60,54 +58,57 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; - Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - CATCH_REQUIRE_NOTHROW(gen.seed()); - auto && C = at::globalContext(); - if(at::hasCUDA()) { - auto t2 = zeros({4,4}, at::kCUDA); + cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " + << bar.toDouble() << " " << what.isIntegral() << "\n"; + Generator& gen = at::globalContext().defaultGenerator(at::kCPU); + ASSERT_NO_THROW(gen.seed()); + auto&& C = at::globalContext(); + if (at::hasCUDA()) { + auto t2 = zeros({4, 4}, at::kCUDA); cout << &t2 << "\n"; } - auto t = ones({4,4}); + auto t = ones({4, 4}); - auto wha2 = zeros({4,4}).add(t).sum(); - CATCH_REQUIRE( wha2.item() == 16.0 ); + auto wha2 = zeros({4, 4}).add(t).sum(); + ASSERT_EQ(wha2.item(), 16.0); - CATCH_REQUIRE( t.sizes()[0] == 4 ); - CATCH_REQUIRE( t.sizes()[1] == 4 ); - CATCH_REQUIRE( t.strides()[0] == 4 ); - CATCH_REQUIRE( t.strides()[1] == 1 ); + ASSERT_EQ(t.sizes()[0], 4); + ASSERT_EQ(t.sizes()[1], 4); + ASSERT_EQ(t.strides()[0], 4); + ASSERT_EQ(t.strides()[1], 1); - Type & T = CPU(Float); - Tensor x = randn({1,10}, T); - Tensor prev_h = randn({1,20}, T); - Tensor W_h = randn({20,20}, T); - Tensor W_x = randn({20,10}, T); + Type& T = CPU(Float); + Tensor x = randn({1, 10}, T); + Tensor prev_h = randn({1, 20}, T); + Tensor W_h = randn({20, 20}, T); + Tensor W_x = randn({20, 10}, T); Tensor i2h = at::mm(W_x, 
x.t()); Tensor h2h = at::mm(W_h, prev_h.t()); Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); + ASSERT_ANY_THROW(at::_local_scalar(Tensor{})); test_overflow(); - if(at::hasCUDA()) { + if (at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); + ASSERT_TRUE(CPU(Float).copy(r).equal(next_h)); } - CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); + ASSERT_NO_THROW(randn({10, 10, 2}, T)); // check Scalar.toTensor on Scalars backed by different data types - CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + ASSERT_EQ(scalar_to_tensor(bar).type().scalarType(), kDouble); + ASSERT_EQ(scalar_to_tensor(what).type().scalarType(), kLong); + ASSERT_EQ( + scalar_to_tensor(ones({})._local_scalar()).type().scalarType(), kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + ASSERT_NO_THROW( + ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -115,11 +116,11 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { - auto x = ones({1,2}, T); - _CATCH_REQUIRE_THROWS(x.item()); + auto x = ones({1, 2}, T); + ASSERT_ANY_THROW(x.item()); } auto float_one = ones({}, T); - CATCH_REQUIRE(float_one.item() == 1); - CATCH_REQUIRE(float_one.item() == 1); - CATCH_REQUIRE((float_one.item() == 1)); + ASSERT_EQ(float_one.item(), 1); + ASSERT_EQ(float_one.item(), 1); + ASSERT_EQ(float_one.item(), 1); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 8dc015dd1d06ae..5b6ec01a1577c4 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -11,12 +10,23 @@ #include #include +#define ASSERT_EQ_CUDA(X, Y) \ + { \ + bool isTRUE = X == Y; \ + ASSERT_TRUE(isTRUE); \ + } + +#define ASSERT_NE_CUDA(X, Y) \ + { \ + bool isFALSE = X == Y; \ + ASSERT_FALSE(isFALSE); \ + } + /* -Tests related to ATen streams. -*/ -CATCH_TEST_CASE( - "Copying and Moving Streams", - "Verifies streams are live through copying and moving") { + Tests related to ATen streams. 
+ */ +// Verifies streams are live through copying and moving +TEST(TestStream, CopyAndMoveTest) { int32_t device = -1; cudaStream_t cuda_stream; @@ -29,14 +39,14 @@ CATCH_TEST_CASE( copyStream = s; - CATCH_REQUIRE(copyStream.internals() == s.internals()); - CATCH_REQUIRE(copyStream.device() == device); - CATCH_REQUIRE(copyStream.stream() == cuda_stream); + ASSERT_EQ_CUDA(copyStream.internals(), s.internals()); + ASSERT_EQ_CUDA(copyStream.device(), device); + ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream); } - CATCH_REQUIRE(copyStream.internals()); - CATCH_REQUIRE(copyStream.device() == device); - CATCH_REQUIRE(copyStream.stream() == cuda_stream); + ASSERT_TRUE(copyStream.internals()); + ASSERT_EQ_CUDA(copyStream.device(), device); + ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,43 +57,43 @@ CATCH_TEST_CASE( moveStream = std::move(s); - CATCH_REQUIRE(moveStream.device() == device); - CATCH_REQUIRE(moveStream.stream() == cuda_stream); + ASSERT_EQ_CUDA(moveStream.device(), device); + ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream); } - CATCH_REQUIRE(moveStream.internals()); - CATCH_REQUIRE(moveStream.device() == device); - CATCH_REQUIRE(moveStream.stream() == cuda_stream); + ASSERT_TRUE(moveStream.internals()); + ASSERT_EQ_CUDA(moveStream.device(), device); + ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream); } -CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +// Verifies streams are set properly +TEST(TestStream, GetAndSetTest) { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(myStream == curStream); + ASSERT_EQ_CUDA(myStream, curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(defaultStream != myStream); - CATCH_REQUIRE(curStream == defaultStream); + ASSERT_NE_CUDA(defaultStream, myStream); + ASSERT_EQ_CUDA(curStream, defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(cur_thread_stream == new_stream); + ASSERT_EQ_CUDA(cur_thread_stream, new_stream); } -CATCH_TEST_CASE( - "Multithread Getting and Setting", - "Ensures streams are thread local") { +// Ensures streams are thread local +TEST(TestStream, MultithreadGetAndSetTest) { at::cuda::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; @@ -94,25 +104,25 @@ CATCH_TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - CATCH_REQUIRE(cur_stream == default_stream); - CATCH_REQUIRE(cur_stream != s0); - CATCH_REQUIRE(cur_stream != s1); - CATCH_REQUIRE(s0 != s1); + ASSERT_EQ_CUDA(cur_stream, default_stream); + ASSERT_NE_CUDA(cur_stream, s0); + ASSERT_NE_CUDA(cur_stream, s1); + ASSERT_NE_CUDA(s0, s1); } -CATCH_TEST_CASE("CUDAGuard") { +// CUDA Guard +TEST(TestStream, CUDAGuardTest) { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - CATCH_REQUIRE(at::cuda::current_device() == 0); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); std::vector streams0 = { - 
at::cuda::getDefaultCUDAStream(), - at::cuda::createCUDAStream()}; - CATCH_REQUIRE(streams0[0].device() == 0); - CATCH_REQUIRE(streams0[1].device() == 0); + at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; + ASSERT_EQ_CUDA(streams0[0].device(), 0); + ASSERT_EQ_CUDA(streams0[1].device(), 0); at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +131,46 @@ CATCH_TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - CATCH_REQUIRE(streams1[0].device() == 1); - CATCH_REQUIRE(streams1[1].device() == 1); + ASSERT_EQ_CUDA(streams1[0].device(), 1); + ASSERT_EQ_CUDA(streams1[1].device(), 1); at::cuda::setCurrentCUDAStream(streams1[0]); - CATCH_REQUIRE(at::cuda::current_device() == 0); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - CATCH_REQUIRE(guard.original_streams().empty()); + ASSERT_TRUE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - CATCH_REQUIRE( - guard.original_streams().size() == at::cuda::getNumGPUs()); - CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); - CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); + ASSERT_EQ_CUDA(guard.original_streams().size(), at::cuda::getNumGPUs()); + ASSERT_EQ_CUDA(guard.original_streams()[0], streams0[0]); + ASSERT_EQ_CUDA(guard.original_streams()[1], streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - CATCH_REQUIRE(guard.last_device() == 1); - CATCH_REQUIRE(at::cuda::current_device() == 1); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + ASSERT_EQ_CUDA(guard.last_device(), 1); + ASSERT_EQ_CUDA(at::cuda::current_device(), 1); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[1]); } // Device and stream are now reset - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - CATCH_REQUIRE(guard.last_device() == 1); - CATCH_REQUIRE(at::cuda::current_device() == 1); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(guard.last_device(), 1); + ASSERT_EQ_CUDA(at::cuda::current_device(), 1); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); } - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
@@ -171,12 +180,13 @@ CATCH_TEST_CASE("CUDAGuard") { guard.set_device(1); } - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); } -CATCH_TEST_CASE("CUDAGuardIsMovable") { +// CUDAGuardIsMovable +TEST(TestStream, CUDAGuardMovableTest) { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +195,18 @@ CATCH_TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - CATCH_REQUIRE(second.original_streams().size() == device_count); - CATCH_REQUIRE(second.original_device() == 0); - CATCH_REQUIRE(second.last_device() == 1); + ASSERT_EQ_CUDA(second.original_streams().size(), device_count); + ASSERT_EQ_CUDA(second.original_device(), 0); + ASSERT_EQ_CUDA(second.last_device(), 1); at::cuda::CUDAGuard third; third = std::move(second); - CATCH_REQUIRE(third.original_streams().size() == device_count); - CATCH_REQUIRE(third.original_device() == 0); - CATCH_REQUIRE(third.last_device() == 1); + ASSERT_EQ_CUDA(third.original_streams().size(), device_count); + ASSERT_EQ_CUDA(third.original_device(), 0); + ASSERT_EQ_CUDA(third.last_device(), 1); } -CATCH_TEST_CASE("Streampool Round Robin") { +// Streampool Round Robin +TEST(TestStream, StreamPoolTest) { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -206,14 +217,17 @@ CATCH_TEST_CASE("Streampool Round Robin") { for (auto i = decltype(streams.size()){0}; i < streams.size(); ++i) { cudaStream_t cuda_stream = streams[i]; auto result_pair = stream_set.insert(cuda_stream); - if (!result_pair.second) hasDuplicates = true; + if (!result_pair.second) + hasDuplicates = true; } - CATCH_REQUIRE(hasDuplicates); + ASSERT_TRUE(hasDuplicates); } -CATCH_TEST_CASE("Multi-GPU") { - if (at::cuda::getNumGPUs() < 2) return; +// Multi-GPU +TEST(TestStream, MultiGPUTest) { + if (at::cuda::getNumGPUs() < 2) + return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); at::cuda::CUDAStream s1 = at::cuda::createCUDAStream(false, 1); @@ -221,17 +235,18 @@ CATCH_TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + ASSERT_EQ_CUDA(s0, at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + ASSERT_EQ_CUDA(s1, at::cuda::getCurrentCUDAStream()); } -CATCH_TEST_CASE("CUDAEvent Syncs") { +// CUDAEvent Syncs +TEST(TestStream, CUDAEventSyncTest) { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - CATCH_REQUIRE(!event.happened()); + ASSERT_FALSE(event.happened()); event.recordOnce(stream); @@ -242,11 +257,13 @@ CATCH_TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - CATCH_REQUIRE(event.happened()); + ASSERT_TRUE(event.happened()); } -CATCH_TEST_CASE("Cross-Device Events") { - if (at::cuda::getNumGPUs() < 2) return; +// Cross-Device Events +TEST(TestStream, CrossDeviceTest) { + if (at::cuda::getNumGPUs() < 2) + return; const auto stream0 = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event0; @@ -257,13 +274,13 @@ CATCH_TEST_CASE("Cross-Device Events") { 
event0.record(stream0); event1.record(stream1); - + event0 = std::move(event1); - - CATCH_REQUIRE(event0.device() == 1); + + ASSERT_EQ_CUDA(event0.device(), 1); stream0.synchronize_with(event0); - + cudaStreamSynchronize(stream0); - CATCH_REQUIRE(event0.happened()); + ASSERT_TRUE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 81701733b53693..99421ca225a361 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,12 +10,11 @@ using namespace at; -CATCH_TEST_CASE( "parallel", "[cpu]" ) { - +TEST(TestParallel, TestParallel) { manual_seed(123, at::kCPU); set_num_threads(1); - Tensor a = rand({1,3}); + Tensor a = rand({1, 3}); a[0][0] = 1; a[0][1] = 0; a[0][2] = 0; @@ -24,5 +22,5 @@ CATCH_TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - CATCH_REQUIRE(a.sum(0).equal(as)); + ASSERT_TRUE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index c01dff2d0038b1..8518c4f4358365 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/core/UndefinedTensorImpl.h" @@ -8,7 +7,7 @@ using namespace at; -CATCH_TEST_CASE( "undefined tensor test", "[]" ) { +TEST(TestUndefined, UndefinedTest) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +16,36 @@ CATCH_TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - CATCH_REQUIRE(!und.defined()); - CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); - - _CATCH_REQUIRE_THROWS(und.strides()); - _CATCH_REQUIRE_THROWS(und.dim()); - _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - _CATCH_REQUIRE_THROWS(und.add(und)); - _CATCH_REQUIRE_THROWS(und.add(ft)); - _CATCH_REQUIRE_THROWS(ft.add(und)); - _CATCH_REQUIRE_THROWS(und.add(5)); - _CATCH_REQUIRE_THROWS(und.mm(und)); + ASSERT_FALSE(und.defined()); + ASSERT_EQ(std::string("UndefinedType"), und.toString()); + + ASSERT_ANY_THROW(und.strides()); + ASSERT_ANY_THROW(und.dim()); + ASSERT_ANY_THROW([]() { return Tensor(); }() = Scalar(5)); + ASSERT_ANY_THROW(und.add(und)); + ASSERT_ANY_THROW(und.add(ft)); + ASSERT_ANY_THROW(ft.add(und)); + ASSERT_ANY_THROW(und.add(5)); + ASSERT_ANY_THROW(und.mm(und)); und.toType(und.type()); - _CATCH_REQUIRE_THROWS(und.toType(ft.type())); - _CATCH_REQUIRE_THROWS(ft.toType(und.type())); + ASSERT_ANY_THROW(und.toType(ft.type())); + ASSERT_ANY_THROW(ft.toType(und.type())); und.toType(ScalarType::Undefined); - _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); - _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + ASSERT_ANY_THROW(und.toType(ScalarType::Float)); + ASSERT_ANY_THROW(ft.toType(ScalarType::Undefined)); // copy_ - _CATCH_REQUIRE_THROWS(und.copy_(und)); - _CATCH_REQUIRE_THROWS(und.copy_(ft)); - _CATCH_REQUIRE_THROWS(ft.copy_(und)); + ASSERT_ANY_THROW(und.copy_(und)); + ASSERT_ANY_THROW(und.copy_(ft)); + ASSERT_ANY_THROW(ft.copy_(und)); und.toBackend(Backend::Undefined); - _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); - _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + 
ASSERT_ANY_THROW(und.toBackend(Backend::CPU)); + ASSERT_ANY_THROW(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - CATCH_REQUIRE(!to_move.defined()); - CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); + ASSERT_FALSE(to_move.defined()); + ASSERT_EQ(to_move.unsafeGetTensorImpl(), UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 42c9f61b19b5e1..3539db77d65517 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" @@ -10,53 +9,55 @@ using at::Tensor; using at::WeakTensor; -CATCH_TEST_CASE( "Weak pointer tests", "" ) { - CATCH_SECTION("gets invalidated") { - Tensor a = at::ones({2, 2}); +// Weak pointer tests +// gets invalidated +TEST(TestWeakPointer, WeakPointerGetsInvalidated) { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + a.reset(); + ASSERT_FALSE(b.lock().defined()); +} + +// can successfully lock +TEST(TestWeakPointer, WeakPointerLock) { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + auto c = b.lock(); + ASSERT_TRUE(c.defined()); + + a.reset(); + ASSERT_TRUE(b.lock().defined()); + c.reset(); + ASSERT_FALSE(b.lock().defined()); +} + +// updates refcounts correctly +TEST(TestWeakPointer, WeakUpdatesRefcountsTest) { + Tensor a = at::ones({2, 2}); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { WeakTensor b = a; - a.reset(); - CATCH_REQUIRE_FALSE(b.lock().defined()); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 2); } - - CATCH_SECTION("can successfully lock") { - Tensor a = at::ones({2, 2}); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { WeakTensor b = a; - auto c = b.lock(); - CATCH_REQUIRE(c.defined()); - - a.reset(); - CATCH_REQUIRE(b.lock().defined()); - c.reset(); - CATCH_REQUIRE_FALSE(b.lock().defined()); + ASSERT_EQ(a.use_count(), 1); + auto locked = b.lock(); + ASSERT_TRUE(locked.defined()); + ASSERT_EQ(a.use_count(), 2); } - - CATCH_SECTION("updates refcounts correctly") { - Tensor a = at::ones({2, 2}); - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 2); - } - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - auto locked = b.lock(); - CATCH_REQUIRE(locked.defined()); - CATCH_REQUIRE(a.use_count() == 2); - } - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 2); - a.reset(); - CATCH_REQUIRE(b.use_count() == 0); - CATCH_REQUIRE(b.weak_use_count() == 1); - } + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { + WeakTensor b = a; + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 2); + a.reset(); + ASSERT_EQ(b.use_count(), 0); + ASSERT_EQ(b.weak_use_count(), 1); } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index f76dac212a0921..f08071424625b3 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,45 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include 
"test_seed.h" using namespace at; +void TestSimpleCase(Type& T) { + auto a = randn({2, 3, 4, 5}, T); + ASSERT_TRUE(a.prod(-4).equal(a.prod(0))); + ASSERT_TRUE(a.prod(3).equal(a.prod(-1))); +} + +void TestExpressionSpecification(Type& T) { + auto a = randn({2, 3, 4, 5}, T); + ASSERT_TRUE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + ASSERT_TRUE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + + // can unsqueeze scalar + auto b = randn(1, T); + b.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(b.unsqueeze(0).equal(b.unsqueeze(-1))); +} + +void TestEmptyTensor(Type& T) { + auto a = randn(0, T); + ASSERT_TRUE(a.prod(0).equal(at::ones({}, T))); +} + +void TestScalarVs1Dim1Size(Type& T) { + auto a = randn(1, T); + ASSERT_TRUE(a.prod(0).equal(a.prod(-1))); + a.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_EQ(a.dim(), 0); + ASSERT_TRUE(a.prod(0).equal(a.prod(-1))); +} -CATCH_TEST_CASE( "wrapdim test", "[]" ) { +TEST(TestWrapdim, TestWrapdim) { manual_seed(123, at::kCPU); + Type& T = CPU(kFloat); - Type & T = CPU(kFloat); - - CATCH_SECTION( "simple case" ) { - auto a = randn({2, 3, 4, 5}, T); - CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); - CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); - } - - CATCH_SECTION( "expression specification" ) { - auto a = randn({2, 3, 4, 5}, T); - CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); - - // can unsqueeze scalar - auto b = randn(1, T); - b.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); - } - - CATCH_SECTION( "empty tensor" ) { - auto a = randn(0, T); - CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); - } - - CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { - auto a = randn(1, T); - CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); - a.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(a.dim() == 0); - CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); - } + TestSimpleCase(T); + TestEmptyTensor(T); + TestScalarVs1Dim1Size(T); + TestExpressionSpecification(T); } From 5da8a8c785201187c04693d4e0ac811d9016dc46 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 26 Sep 2018 21:29:43 -0700 Subject: [PATCH 11/82] Handle undefined tensor in blob correctly. (#12125) Summary: You can't GetDeviceType an undefined tensor, so test for this case first. This allows you to safely move tensors out of blobs. Signed-off-by: Edward Z. 
Yang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12125

Reviewed By: smessmer

Differential Revision: D10080075

Pulled By: ezyang

fbshipit-source-id: bb99b089b6daa9d4db99015208f939d7ce4d4a79
---
 caffe2/core/blob.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
index e09a54cbd2df56..06f278aac2ae86 100644
--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@@ -21,13 +21,13 @@ inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) {
     return false;
   }
   const Tensor* tensor = &blob.Get<Tensor>();
-  return tensor && tensor->GetDeviceType() == device_type;
+  return tensor && *tensor && tensor->GetDeviceType() == device_type;
 }

 inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) {
   if (blob->IsType<Tensor>()) {
     Tensor* tensor = blob->GetMutable<Tensor>();
-    if (tensor->GetDeviceType() == device_type) {
+    if (*tensor && tensor->GetDeviceType() == device_type) {
       return tensor;
     }
   }

From 383d340e8896ff1a12ff64c7b9d342f56eac42e7 Mon Sep 17 00:00:00 2001
From: Jerry Ma
Date: Wed, 26 Sep 2018 21:33:25 -0700
Subject: [PATCH 12/82] Small optimization for adam (#12107)

Summary:
Apply weight decay for Adam in-place instead of via copy. Synced offline with soumith, who mentioned that it should be OK. This is also consistent with other optimizers, e.g.
https://github.com/pytorch/pytorch/blob/eee01731a5d33d5be58d875711bd2577e38dbddf/torch/optim/sgd.py#L93
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12107

Reviewed By: soumith

Differential Revision: D10071787

Pulled By: jma127

fbshipit-source-id: 5fd7939c79039693b225c44c4c80450923b8d673
---
 torch/optim/adam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 308ec0c8cf9150..a26de99ec02b93 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -87,7 +87,7 @@ def step(self, closure=None):
                 state['step'] += 1

                 if group['weight_decay'] != 0:
-                    grad = grad.add(group['weight_decay'], p.data)
+                    grad.add_(group['weight_decay'], p.data)

                 # Decay the first and second moment running average coefficient
                 exp_avg.mul_(beta1).add_(1 - beta1, grad)

From 9c49bb9ddf87e21ea01e3e2d582f7238e189d480 Mon Sep 17 00:00:00 2001
From: Yangqing Jia
Date: Thu, 27 Sep 2018 03:05:23 -0700
Subject: [PATCH 13/82] Move registry fully to c10 (#12077)

Summary:
This does seven things:
- add c10/util/Registry.h as the unified registry util
- clean up some APIs, such as the export condition
- fully remove aten/core/registry.h
- fully remove caffe2/core/registry.h
- remove a bogus aten/registry.h
- unify all macros
- set up registry testing in c10

Also, an important note: we used to mark the templated Registry class as EXPORT. This should not happen, because one should almost never export a template class. This PR fixes that.
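For reference, the usage pattern the unified macros support, as exercised by the new c10/test/registry_test.cpp, looks roughly like this (a minimal sketch; Widget, BigWidget, and WidgetRegistry are illustrative names, not part of this change):

// widget.h -- the DECLARE side, included by every user of the registry.
#include "c10/util/Registry.h"

struct Widget {
  explicit Widget(int size) : size_(size) {}
  virtual ~Widget() = default;
  int size_;
};

// Declares an (imported) accessor function returning the registry singleton.
C10_DECLARE_REGISTRY(WidgetRegistry, Widget, int);

// widget.cpp -- the DEFINE side; must appear in exactly one source file,
// which is the translation unit that exports the accessor symbol.
C10_DEFINE_REGISTRY(WidgetRegistry, Widget, int);

// Any translation unit can then register a subclass under a string key...
struct BigWidget : Widget {
  explicit BigWidget(int size) : Widget(2 * size) {}
};
C10_REGISTER_CLASS(WidgetRegistry, BigWidget, BigWidget);

// ...and callers construct by name at runtime:
//   std::unique_ptr<Widget> w = WidgetRegistry()->Create("BigWidget", 3);

Because the registry accessor is an ordinary function (imported where declared, exported where defined), the Registry template itself never has to be exported across a shared-library boundary.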
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12077 Reviewed By: ezyang Differential Revision: D10050771 Pulled By: Yangqing fbshipit-source-id: 417b249b49fed6a67956e7c6b6d22374bcee24cf --- .jenkins/pytorch/test.sh | 1 + aten/src/ATen/Registry.h | 2 - aten/src/ATen/core/LegacyTypeDispatch.cpp | 5 +- aten/src/ATen/core/LegacyTypeDispatch.h | 8 +- aten/src/ATen/core/Registry.h | 217 ----------------- aten/src/ATen/core/VariableHooksInterface.cpp | 5 +- aten/src/ATen/core/VariableHooksInterface.h | 12 +- aten/src/ATen/detail/CUDAHooksInterface.cpp | 2 +- aten/src/ATen/detail/CUDAHooksInterface.h | 7 +- .../src/ATen/detail/ComplexHooksInterface.cpp | 6 +- aten/src/ATen/detail/ComplexHooksInterface.h | 9 +- aten/src/THC/THCNumerics.cuh | 1 + c10/CMakeLists.txt | 5 + c10/c10_dummy.cpp | 7 - c10/c10_dummy.h | 7 - c10/macros/Export.h | 7 +- c10/macros/Legacy.h | 7 - c10/macros/Macros.h | 8 +- c10/macros/cmake_macros.h.in | 7 +- c10/test/CMakeLists.txt | 15 ++ c10/test/registry_test.cpp | 49 ++++ c10/util/Registry.h | 226 ++++++++++++++++++ c10/util/Type.cpp | 59 +++++ c10/util/Type.h | 28 +++ caffe2/core/allocator.h | 1 + caffe2/core/blob_serialization.cc | 4 +- caffe2/core/blob_serializer_base.h | 10 +- caffe2/core/blob_stats.h | 4 +- caffe2/core/blob_test.cc | 4 +- caffe2/core/common.h | 1 + caffe2/core/common_gpu.cc | 1 + caffe2/core/db.cc | 2 +- caffe2/core/db.h | 6 +- caffe2/core/flags.cc | 4 +- caffe2/core/flags.h | 5 +- .../core/hip/net_async_hip_thread_pool_hip.cc | 2 +- caffe2/core/logging.cc | 1 + caffe2/core/net.cc | 2 +- caffe2/core/net.h | 8 +- caffe2/core/net_async_base.cc | 11 +- caffe2/core/net_async_base.h | 9 +- caffe2/core/net_async_gpu_thread_pool_gpu.cc | 2 +- caffe2/core/net_dag.h | 2 +- caffe2/core/net_dag_utils.h | 2 +- caffe2/core/net_simple.h | 2 +- caffe2/core/net_simple_async.h | 2 +- caffe2/core/observer_test.cc | 2 +- caffe2/core/operator.cc | 11 +- caffe2/core/operator.h | 40 ++-- caffe2/core/operator_c10wrapper.cc | 2 +- caffe2/core/operator_c10wrapper.h | 10 +- caffe2/core/operator_gradient.h | 8 +- caffe2/core/operator_schema.h | 6 +- caffe2/core/registry.h | 207 ---------------- caffe2/core/registry_test.cc | 46 ---- caffe2/core/transform.cc | 2 +- caffe2/core/transform.h | 4 +- caffe2/core/workspace.h | 2 +- caffe2/ideep/utils/ideep_operator.h | 10 +- caffe2/ideep/utils/ideep_register.cc | 2 +- caffe2/mkl/mkl_operator.cc | 2 +- caffe2/mkl/utils/mkl_operator.h | 10 +- .../mobile/contrib/arm-compute/core/net_gl.h | 2 +- .../contrib/arm-compute/core/operator.cc | 7 +- .../contrib/arm-compute/core/operator.h | 33 +-- .../fused_rowwise_8bit_conversion_ops.cc | 2 +- .../fused_rowwise_random_quantization_ops.cc | 2 +- .../lengths_reducer_fused_8bit_rowwise_ops.cc | 2 +- .../lengths_reducer_rowwise_8bit_ops.cc | 2 +- caffe2/opt/converter.cc | 2 +- caffe2/opt/converter.h | 4 +- caffe2/opt/passes.cc | 8 +- caffe2/opt/passes.h | 12 +- caffe2/python/pybind_state.cc | 4 +- caffe2/python/pybind_state.h | 8 +- caffe2/python/pybind_state_registry.cc | 2 +- caffe2/python/pybind_state_registry.h | 23 +- setup.py | 1 + 78 files changed, 593 insertions(+), 670 deletions(-) delete mode 100644 aten/src/ATen/Registry.h delete mode 100644 aten/src/ATen/core/Registry.h delete mode 100644 c10/c10_dummy.cpp delete mode 100644 c10/c10_dummy.h delete mode 100644 c10/macros/Legacy.h create mode 100644 c10/test/CMakeLists.txt create mode 100644 c10/test/registry_test.cpp create mode 100644 c10/util/Registry.h create mode 100644 c10/util/Type.cpp create mode 100644 
c10/util/Type.h delete mode 100644 caffe2/core/registry.h delete mode 100644 caffe2/core/registry_test.cc diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 471fd8fac1fc6e..c43e821d98daf5 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -102,6 +102,7 @@ test_aten() { SUDO=sudo fi + ${SUDO} ln -s "$TORCH_LIB_PATH"/libc10* build/bin ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin ${SUDO} ln -s "$TORCH_LIB_PATH"/libnccl* build/bin diff --git a/aten/src/ATen/Registry.h b/aten/src/ATen/Registry.h deleted file mode 100644 index 9d8d8ff2ee8404..00000000000000 --- a/aten/src/ATen/Registry.h +++ /dev/null @@ -1,2 +0,0 @@ -#pragma once -#include diff --git a/aten/src/ATen/core/LegacyTypeDispatch.cpp b/aten/src/ATen/core/LegacyTypeDispatch.cpp index 6835399bfe2ca8..56c19cda3f4271 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.cpp +++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp @@ -9,7 +9,10 @@ LegacyTypeDispatch & globalLegacyTypeDispatch() { return singleton; } -AT_DEFINE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs) +C10_DEFINE_REGISTRY( + LegacyTypeInitRegistry, + LegacyTypeInitInterface, + LegacyTypeInitArgs) const LegacyTypeInitInterface& getLegacyTypeInit() { static std::unique_ptr legacy_type_init; diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 53cedf04e4601a..5383acbb97ebf7 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -43,8 +43,12 @@ struct CAFFE2_API LegacyTypeInitInterface { } }; struct CAFFE2_API LegacyTypeInitArgs {}; -AT_DECLARE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs); -#define REGISTER_LEGACY_TYPE_INIT(clsname) AT_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) +C10_DECLARE_REGISTRY( + LegacyTypeInitRegistry, + LegacyTypeInitInterface, + LegacyTypeInitArgs); +#define REGISTER_LEGACY_TYPE_INIT(clsname) \ + C10_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) CAFFE2_API const LegacyTypeInitInterface& getLegacyTypeInit(); diff --git a/aten/src/ATen/core/Registry.h b/aten/src/ATen/core/Registry.h deleted file mode 100644 index 98a3e4a18c7258..00000000000000 --- a/aten/src/ATen/core/Registry.h +++ /dev/null @@ -1,217 +0,0 @@ -#pragma once - -/** - * Simple registry implementation that uses static variables to - * register object creators during program initialization time. - */ - -// NB: This Registry works poorly when you have other namespaces. -// Make all macro invocations from inside the at namespace. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace at { - -template -inline void PrintOffendingKey(const KeyType& /*key*/) { - printf("[key type printing not supported]\n"); -} - -template <> -inline void PrintOffendingKey(const std::string& key) { - printf("Offending key: %s.\n", key.c_str()); -} - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a std::string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. 
- */ -template -class CAFFE2_API Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. - std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const std::string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a std::vector. - */ - std::vector Keys() { - std::vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const std::unordered_map& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - std::unordered_map registry_; - std::unordered_map help_message_; - std::mutex register_mutex_; - - Registry(const Registry&) = delete; - Registry& operator=(const Registry&) = delete; -}; - -template -class CAFFE2_API Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const std::string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 -#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) -#else -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) -#endif - -/** - * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - CAFFE2_API Registry, __VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, __VA_ARGS__> \ - Registerer##RegistryName; \ - extern template class Registerer, __VA_ARGS__>; - -#define AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) 
\ - Registry, __VA_ARGS__>* RegistryName() { \ - static Registry, __VA_ARGS__>* registry = \ - new Registry, __VA_ARGS__>(); \ - return registry; \ - } \ - template class Registerer, __VA_ARGS__>; - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) \ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - ::at::demangle_type<__VA_ARGS__>()); \ - } - -// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define AT_REGISTER_CLASS(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace at diff --git a/aten/src/ATen/core/VariableHooksInterface.cpp b/aten/src/ATen/core/VariableHooksInterface.cpp index 3728114492e53b..b9d90f56b8683b 100644 --- a/aten/src/ATen/core/VariableHooksInterface.cpp +++ b/aten/src/ATen/core/VariableHooksInterface.cpp @@ -24,6 +24,9 @@ namespace detail { } -AT_DEFINE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) +C10_DEFINE_REGISTRY( + VariableHooksRegistry, + VariableHooksInterface, + VariableHooksArgs) } // namespace at::detail diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index e8fd4da9e27536..0b8eb1532c1bc6 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -1,8 +1,8 @@ #pragma once -#include -#include #include +#include +#include "c10/util/Registry.h" namespace at { class LegacyTypeDispatch; @@ -39,8 +39,12 @@ struct CAFFE2_API VariableHooksInterface { // for the "..." 
in a variadic macro" struct CAFFE2_API VariableHooksArgs {}; -AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) -#define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) +C10_DECLARE_REGISTRY( + VariableHooksRegistry, + VariableHooksInterface, + VariableHooksArgs); +#define REGISTER_VARIABLE_HOOKS(clsname) \ + C10_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const VariableHooksInterface& getVariableHooks(); diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index ec2ac11f305dcf..f3299b34cb7f9b 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -54,6 +54,6 @@ const CUDAHooksInterface& getCUDAHooks() { } } // namespace detail -AT_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +C10_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 69149932ac7b98..b8cff1a7aa125f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -2,9 +2,10 @@ #include #include -#include #include +#include "c10/util/Registry.h" + #include #include #include @@ -131,9 +132,9 @@ struct CAFFE2_API CUDAHooksInterface { // for the "..." in a variadic macro" struct CAFFE2_API CUDAHooksArgs {}; -AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +C10_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs); #define REGISTER_CUDA_HOOKS(clsname) \ - AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) + C10_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const CUDAHooksInterface& getCUDAHooks(); diff --git a/aten/src/ATen/detail/ComplexHooksInterface.cpp b/aten/src/ATen/detail/ComplexHooksInterface.cpp index 9755e288ff5fe7..a7ffcf1d625f2b 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.cpp +++ b/aten/src/ATen/detail/ComplexHooksInterface.cpp @@ -20,6 +20,8 @@ const ComplexHooksInterface& getComplexHooks() { } } // namespace detail -AT_DEFINE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) - +C10_DEFINE_REGISTRY( + ComplexHooksRegistry, + ComplexHooksInterface, + ComplexHooksArgs) } diff --git a/aten/src/ATen/detail/ComplexHooksInterface.h b/aten/src/ATen/detail/ComplexHooksInterface.h index e5d5c3ec2a83fa..ec2995a498dc19 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.h +++ b/aten/src/ATen/detail/ComplexHooksInterface.h @@ -1,7 +1,7 @@ #pragma once -#include #include +#include "c10/util/Registry.h" namespace at { @@ -16,9 +16,12 @@ struct CAFFE2_API ComplexHooksInterface { }; struct CAFFE2_API ComplexHooksArgs {}; -AT_DECLARE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) +C10_DECLARE_REGISTRY( + ComplexHooksRegistry, + ComplexHooksInterface, + ComplexHooksArgs); #define REGISTER_COMPLEX_HOOKS(clsname) \ - AT_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) + C10_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const ComplexHooksInterface& getComplexHooks(); diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 157a324f6e45b8..27ec95adbaa82e 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -1,6 +1,7 @@ #ifndef THC_NUMERICS_INC #define THC_NUMERICS_INC +#include 
#include #include #include diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 4b7bab4f42eeb9..490baa56a8acee 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -16,6 +16,9 @@ configure_file( # check with the core PyTorch developers as the dependendency will be # transitively passed on to all libraries dependent on PyTorch. file(GLOB_RECURSE C10_SRCS *.cpp) +# exclude test files +file(GLOB_RECURSE C10_ALL_TEST_FILES test/*.cpp) +exclude(C10_SRCS "${C10_SRCS}" ${C10_ALL_TEST_FILES}) file(GLOB_RECURSE C10_HEADERS *.h) add_library(c10 ${C10_SRCS} ${C10_HEADERS}) # If building shared library, set dllimport/dllexport proper. @@ -31,6 +34,8 @@ target_include_directories( $ $) +add_subdirectory(test) + # ---[ Installation # Note: for now, we will put all export path into one single Caffe2Targets group # to deal with the cmake deployment need. Inside the Caffe2Targets set, the diff --git a/c10/c10_dummy.cpp b/c10/c10_dummy.cpp deleted file mode 100644 index df4e73171da3ff..00000000000000 --- a/c10/c10_dummy.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "c10/c10_dummy.h" - -namespace c10 { -bool HasC10() { - return true; -} -} // namespace c10 diff --git a/c10/c10_dummy.h b/c10/c10_dummy.h deleted file mode 100644 index cf6c6b30c14bbf..00000000000000 --- a/c10/c10_dummy.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include "c10/macros/Macros.h" - -namespace c10 { -C10_API bool HasC10(); -} diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 8e593e0100bbf9..4527150c8f6803 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -1,3 +1,6 @@ +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + /* Header file to define the common scaffolding for exported symbols. * * Export is by itself a quite tricky situation to deal with, and if you are @@ -9,8 +12,6 @@ * Do NOT include this file directly. Instead, use c10/macros/Macros.h */ -#pragma once - // You do not need to edit this part of file unless you are changing the core // pytorch export abstractions. // @@ -74,3 +75,5 @@ #else #define CAFFE2_API C10_IMPORT #endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/c10/macros/Legacy.h b/c10/macros/Legacy.h deleted file mode 100644 index 86752a838acd32..00000000000000 --- a/c10/macros/Legacy.h +++ /dev/null @@ -1,7 +0,0 @@ -/* A centralized location to provide legacy macro support, and a warning about - * when this legacy compatibility symbol is going to removed in the future. - * - * Do NOT include this file directly. Instead, use c10/macros/Macros.h - */ - -#pragma once diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 2b438d670f00de..ad9fafd4ab8f55 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -1,11 +1,12 @@ +#ifndef C10_MACROS_MACROS_H_ +#define C10_MACROS_MACROS_H_ + /* Main entry for c10/macros. * * In your code, include c10/macros/Macros.h directly, instead of individual * files in this folder. */ -#pragma once - // For build systems that do not directly depend on CMake and directly build // from the source directory (such as Buck), one may not have a cmake_macros.h // file at all. 
In this case, the build system is responsible for providing
@@ -28,5 +29,4 @@
   classname(const classname&) = delete; \
   classname& operator=(const classname&) = delete

-// Finally, file that provides legacy support for macros
-#include "c10/macros/Legacy.h"
+#endif // C10_MACROS_MACROS_H_
diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in
index 73bc803f063551..c211c54bdd7af6 100644
--- a/c10/macros/cmake_macros.h.in
+++ b/c10/macros/cmake_macros.h.in
@@ -1,6 +1,9 @@
+#ifndef C10_MACROS_CMAKE_MACROS_H_
+#define C10_MACROS_CMAKE_MACROS_H_
+
 // Automatically generated header file for the C10 library.
 // Do not include this file directly. Instead, include c10/macros/Macros.h.

-#pragma once
-
 #cmakedefine C10_BUILD_SHARED_LIBS
+
+#endif // C10_MACROS_CMAKE_MACROS_H_
diff --git a/c10/test/CMakeLists.txt b/c10/test/CMakeLists.txt
new file mode 100644
index 00000000000000..a2a29f59eb5bd8
--- /dev/null
+++ b/c10/test/CMakeLists.txt
@@ -0,0 +1,15 @@
+# ---[ Test binaries.
+
+file(GLOB C10_ALL_TEST_FILES *.cpp)
+if (BUILD_TEST)
+  foreach(test_src ${C10_ALL_TEST_FILES})
+    get_filename_component(test_file_name ${test_src} NAME_WE)
+    set(test_name "c10_${test_file_name}")
+    add_executable(${test_name} "${test_src}")
+    target_link_libraries(${test_name} c10 gtest_main)
+    add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
+    if (INSTALL_TEST)
+      install(TARGETS ${test_name} DESTINATION test)
+    endif()
+  endforeach()
+endif()
diff --git a/c10/test/registry_test.cpp b/c10/test/registry_test.cpp
new file mode 100644
index 00000000000000..c6e7f620e602b5
--- /dev/null
+++ b/c10/test/registry_test.cpp
@@ -0,0 +1,49 @@
+#include
+#include
+#include
+
+#include "c10/util/Registry.h"
+
+// Note: we use a different namespace to test if the macros defined in
+// Registry.h actually work with a different namespace from c10.
+namespace c10_test {
+
+class Foo {
+ public:
+  explicit Foo(int x) {
+    // LOG(INFO) << "Foo " << x;
+  }
+};
+
+C10_DECLARE_REGISTRY(FooRegistry, Foo, int);
+C10_DEFINE_REGISTRY(FooRegistry, Foo, int);
+#define REGISTER_FOO(clsname) C10_REGISTER_CLASS(FooRegistry, clsname, clsname)
+
+class Bar : public Foo {
+ public:
+  explicit Bar(int x) : Foo(x) {
+    // LOG(INFO) << "Bar " << x;
+  }
+};
+REGISTER_FOO(Bar);
+
+class AnotherBar : public Foo {
+ public:
+  explicit AnotherBar(int x) : Foo(x) {
+    // LOG(INFO) << "AnotherBar " << x;
+  }
+};
+REGISTER_FOO(AnotherBar);
+
+TEST(RegistryTest, CanRunCreator) {
+  std::unique_ptr<Foo> bar(FooRegistry()->Create("Bar", 1));
+  EXPECT_TRUE(bar != nullptr) << "Cannot create bar.";
+  std::unique_ptr<Foo> another_bar(FooRegistry()->Create("AnotherBar", 1));
+  EXPECT_TRUE(another_bar != nullptr);
+}
+
+TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
+  EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr);
+}
+
+} // namespace c10_test
diff --git a/c10/util/Registry.h b/c10/util/Registry.h
new file mode 100644
index 00000000000000..9f310c73483263
--- /dev/null
+++ b/c10/util/Registry.h
@@ -0,0 +1,226 @@
+#ifndef C10_UTIL_REGISTRY_H_
+#define C10_UTIL_REGISTRY_H_
+
+/**
+ * Simple registry implementation that uses static variables to
+ * register object creators during program initialization time.
+ */
+
+// NB: This Registry works poorly when you have other namespaces.
+// Make all macro invocations from inside the c10 namespace.
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "c10/util/Type.h" + +namespace c10 { + +template +inline void PrintOffendingKey(const KeyType& /*key*/) { + printf("[key type printing not supported]\n"); +} + +template <> +inline void PrintOffendingKey(const std::string& key) { + printf("Offending key: %s.\n", key.c_str()); +} + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything + * that can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class Registry { + public: + typedef std::function Creator; + + Registry() : registry_() {} + + void Register(const SrcType& key, Creator creator) { + std::lock_guard lock(register_mutex_); + // The if statement below is essentially the same as the following line: + // CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, CHECK_EQ depends on google logging, and since registration is + // carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. + if (registry_.count(key) != 0) { + printf("Key already registered.\n"); + PrintOffendingKey(key); + std::exit(1); + } + registry_[key] = creator; + } + + void Register( + const SrcType& key, + Creator creator, + const std::string& help_msg) { + Register(key, creator); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { + return (registry_.count(key) != 0); + } + + ObjectPtrType Create(const SrcType& key, Args... args) { + if (registry_.count(key) == 0) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return registry_[key](args...); + } + + /** + * Returns the keys currently registered as a std::vector. + */ + std::vector Keys() const { + std::vector keys; + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + inline const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + private: + std::unordered_map registry_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + C10_DISABLE_COPY_AND_ASSIGN(Registry); +}; + +template +class Registerer { + public: + Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif + +/** + * C10_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. 
+ */
+// Note on C10_IMPORT and C10_EXPORT below: we need to explicitly mark DECLARE
+// as import and DEFINE as export, because these registry macros will be used
+// in downstream shared libraries as well, and one cannot use *_API - the API
+// macro will be defined on a per-shared-library basis. Semantically, when one
+// declares a typed registry it is always going to be IMPORT, and when one
+// defines a registry (which should happen ONLY ONCE and ONLY IN A SOURCE
+// FILE), the instantiation unit is always going to be exported.
+//
+// The only unique condition is when in the same file one does DECLARE and
+// DEFINE - in Windows compilers, this generates a warning that dllimport and
+// dllexport are mixed, but the warning is fine and the linker will properly
+// export the symbol. The same thing happens in the gflags flag declaration
+// and definition cases.
+#define C10_DECLARE_TYPED_REGISTRY( \
+    RegistryName, SrcType, ObjectType, PtrType, ...) \
+  C10_IMPORT ::c10::Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* \
+  RegistryName(); \
+  typedef ::c10::Registerer<SrcType, PtrType<ObjectType>, ##__VA_ARGS__> \
+      Registerer##RegistryName

+#define C10_DEFINE_TYPED_REGISTRY( \
+    RegistryName, SrcType, ObjectType, PtrType, ...) \
+  C10_EXPORT ::c10::Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* \
+  RegistryName() { \
+    static ::c10::Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* \
+        registry = new ::c10:: \
+            Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>(); \
+    return registry; \
+  }
+
+// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated
+// creator with comma in its templated arguments.
+#define C10_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \
+  static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
+      key, RegistryName(), ##__VA_ARGS__);
+
+#define C10_REGISTER_TYPED_CLASS(RegistryName, key, ...) \
+  static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
+      key, \
+      RegistryName(), \
+      Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \
+      ::c10::demangle_type<__VA_ARGS__>());
+
+// C10_DECLARE_REGISTRY and C10_DEFINE_REGISTRY are hard-wired to use
+// std::string as the key type, because that is the most commonly used case.
+#define C10_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
+  C10_DECLARE_TYPED_REGISTRY( \
+      RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__)
+
+#define C10_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
+  C10_DEFINE_TYPED_REGISTRY( \
+      RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__)
+
+#define C10_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \
+  C10_DECLARE_TYPED_REGISTRY( \
+      RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__)
+
+#define C10_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \
+  C10_DEFINE_TYPED_REGISTRY( \
+      RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__)
+
+// C10_REGISTER_CREATOR and C10_REGISTER_CLASS are hard-wired to use
+// std::string as the key type, because that is the most commonly used case.
+#define C10_REGISTER_CREATOR(RegistryName, key, ...) \
+  C10_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__)
+
+#define C10_REGISTER_CLASS(RegistryName, key, ...)
\ + C10_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +} // namespace c10 + +#endif // C10_UTIL_REGISTRY_H_ diff --git a/c10/util/Type.cpp b/c10/util/Type.cpp new file mode 100644 index 00000000000000..3e00055c699104 --- /dev/null +++ b/c10/util/Type.cpp @@ -0,0 +1,59 @@ +#include "c10/util/Type.h" + +#include +#include +#include + +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) +#define HAS_DEMANGLE 0 +#elif defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) +#define HAS_DEMANGLE 0 +#else +#define HAS_DEMANGLE 1 +#endif + +#if HAS_DEMANGLE + +#include +#include + +namespace c10 { + +std::string demangle(const char* name) { + int status = -1; + + // This function will demangle the mangled function name into a more human + // readable format, e.g. _Z1gv -> g(). + // More information: + // https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/libsupc%2B%2B/cxxabi.h + // NOTE: `__cxa_demangle` returns a malloc'd string that we have to free + // ourselves. + std::unique_ptr> demangled( + abi::__cxa_demangle( + name, + /*__output_buffer=*/nullptr, + /*__length=*/0, + &status), + /*deleter=*/free); + + // Demangling may fail, for example when the name does not follow the + // standard C++ (Itanium ABI) mangling scheme. This is the case for `main` + // or `clone` for example, so the mangled name is a fine default. + if (status == 0) { + return demangled.get(); + } else { + return name; + } +} + +} // namespace c10 + +#else // HAS_DEMANGLE +namespace c10 { +std::string demangle(const char* name) { + return std::string(name); +} +} // namespace c10 + +#endif // HAS_DEMANGLE diff --git a/c10/util/Type.h b/c10/util/Type.h new file mode 100644 index 00000000000000..ddaa0c258753a7 --- /dev/null +++ b/c10/util/Type.h @@ -0,0 +1,28 @@ +#ifndef C10_UTIL_TYPE_H_ +#define C10_UTIL_TYPE_H_ + +#include +#include +#include + +#include "c10/macros/Macros.h" + +namespace c10 { + +/// Utility to demangle a C++ symbol name. +C10_API std::string demangle(const char* name); + +/// Returns the printable name of the type. +template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +} // namespace c10 + +#endif // C10_UTIL_TYPE_H_ diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 96bc720ccd59d1..aa41595ae06b66 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_CORE_ALLOCATOR_H_ #define CAFFE2_CORE_ALLOCATOR_H_ +#include #include #include "caffe2/core/logging.h" diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index d4ef19db69ce4f..a60fa12a49b6af 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -322,13 +322,13 @@ void TensorSerializer::StoreDeviceDetail( input.ExtractDeviceOption(proto->mutable_device_detail()); } // The actual serialization registry objects. 
-CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr); -CAFFE_DEFINE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); +C10_DEFINE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); void DeserializeBlob(const string& content, Blob* result) { BlobProto blob_proto; diff --git a/caffe2/core/blob_serializer_base.h b/caffe2/core/blob_serializer_base.h index b51f3da21a30f4..4e0e3e4d6d18fe 100644 --- a/caffe2/core/blob_serializer_base.h +++ b/caffe2/core/blob_serializer_base.h @@ -3,8 +3,8 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { @@ -57,13 +57,13 @@ class BlobSerializerBase { }; // The Blob serialization registry and serializer creator functions. -CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr); #define REGISTER_BLOB_SERIALIZER(id, ...) \ - CAFFE_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__) // Creates an operator with the given operator definition. inline unique_ptr CreateSerializer(TypeIdentifier id) { return BlobSerializerRegistry()->Create(id); @@ -82,9 +82,9 @@ class CAFFE2_API BlobDeserializerBase { virtual void Deserialize(const BlobProto& proto, Blob* blob) = 0; }; -CAFFE_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); +C10_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); #define REGISTER_BLOB_DESERIALIZER(name, ...) \ - CAFFE_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__) // Creates an operator with the given operator definition. 
inline unique_ptr CreateDeserializer(const string& type) { return BlobDeserializerRegistry()->Create(type); diff --git a/caffe2/core/blob_stats.h b/caffe2/core/blob_stats.h index 67f9e88e2edc62..5c9f80f518f91c 100644 --- a/caffe2/core/blob_stats.h +++ b/caffe2/core/blob_stats.h @@ -1,7 +1,7 @@ #pragma once +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" -#include "caffe2/core/registry.h" #include "caffe2/core/typeid.h" #include @@ -33,7 +33,7 @@ struct BlobStatRegistry { #define REGISTER_BLOB_STAT_GETTER(Type, BlobStatGetterClass) \ static BlobStatRegistry::Registrar \ - CAFFE_ANONYMOUS_VARIABLE(BlobStatRegistry) + C10_ANONYMOUS_VARIABLE(BlobStatRegistry) namespace BlobStat { diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index bb2f4ba6a91818..d856655433aa3e 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -3,6 +3,7 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/blob_serialization.h" #include "caffe2/core/common.h" @@ -11,7 +12,6 @@ #include "caffe2/core/operator.h" #include "caffe2/core/qtensor.h" #include "caffe2/core/qtensor_serialization.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/core/workspace.h" @@ -967,7 +967,7 @@ CAFFE_KNOWN_TYPE(DummyType); namespace { REGISTER_BLOB_SERIALIZER((TypeMeta::Id()), DummyTypeSerializer); -CAFFE_REGISTER_TYPED_CLASS( +C10_REGISTER_TYPED_CLASS( BlobDeserializerRegistry, "DummyType", DummyTypeDeserializer); diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 2582a605adee55..d1803a6a2d2812 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -24,6 +24,7 @@ // Macros used during the build of this caffe2 instance. This header file // is automatically generated by the cmake script during build. +#include "caffe2/core/common.h" #include "caffe2/core/macros.h" #include "c10/macros/Macros.h" diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc index 9e39a85721186f..e2794bbd39d92f 100644 --- a/caffe2/core/common_gpu.cc +++ b/caffe2/core/common_gpu.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include "caffe2/core/asan.h" diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index 720c2dcaa46de1..c0031cb0661ec8 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -12,7 +12,7 @@ CAFFE_KNOWN_TYPE(db::Cursor); namespace db { -CAFFE_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); +C10_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); // Below, we provide a bare minimum database "minidb" as a reference // implementation as well as a portable choice to store data. diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 39f8b6f3f02b0d..f6044ff35f8273 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -3,8 +3,8 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/blob_serialization.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { @@ -104,9 +104,9 @@ class CAFFE2_API DB { // Database classes are registered by their names so we can do optional // dependencies. -CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); +C10_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); #define REGISTER_CAFFE2_DB(name, ...) \ - CAFFE_REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__) /** * Returns a database object of the given database type, source and mode. 
The diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index a84d298466dc03..43131d8beebd27 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -1,6 +1,7 @@ #include "caffe2/core/flags.h" #include +#include #include #include "caffe2/core/logging.h" @@ -33,8 +34,7 @@ C10_EXPORT bool CommandLineFlagsHasBeenParsed() { #else // CAFFE2_USE_GFLAGS - -CAFFE_DEFINE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); +C10_DEFINE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); namespace { static bool gCommandLineFlagsParsed = false; diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h index 4e39c7bdebf137..98b137c2f723ef 100644 --- a/caffe2/core/flags.h +++ b/caffe2/core/flags.h @@ -20,7 +20,8 @@ #ifndef CAFFE2_CORE_FLAGS_H_ #define CAFFE2_CORE_FLAGS_H_ -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" +#include "caffe2/core/common.h" namespace caffe2 { /** @@ -142,7 +143,7 @@ class CAFFE2_API Caffe2FlagParser { bool success_; }; -CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); +C10_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); } // namespace caffe2 diff --git a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc index e1b4ff2aebb0fc..3ad9336e6d2aee 100644 --- a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc +++ b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc @@ -55,6 +55,6 @@ GetAsyncNetHIPThreadPool(int hip_gpu_id, int pool_size, bool create_new) { } } -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, HIP, GetAsyncNetHIPThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, HIP, GetAsyncNetHIPThreadPool); } // namespace caffe2 diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index cd057444d31cf4..beaee7b15f9ff0 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -3,6 +3,7 @@ #include #include +#include #include // Common code that we use regardless of whether we use glog or not. diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index 77934f6be12d45..c72c34e37e8c00 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -19,7 +19,7 @@ CAFFE2_DEFINE_string( namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( NetRegistry, NetBase, const std::shared_ptr&, diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 57fd53f1de4f12..30ef4bde50cab7 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -9,12 +9,12 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" @@ -134,15 +134,15 @@ class CAFFE2_API ExecutorHelper { virtual ~ExecutorHelper() {} }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( NetRegistry, NetBase, const std::shared_ptr&, Workspace*); #define REGISTER_NET_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__) #define REGISTER_NET(name, ...) \ - CAFFE_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__) /** * @brief Creates a network, accessing / creating blobs in the given workspace. 
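Note: this patch only renames the macro prefix from CAFFE_ to C10_ as the registry implementation moves to c10/util/Registry.h; the registration semantics are unchanged. As a reminder of what these macros do, here is a minimal usage sketch (Foo/Bar are illustrative names, mirroring the registry_test.cc example deleted later in this patch):

  // In a header: declare a registry keyed by std::string whose objects
  // are constructed with an int argument.
  C10_DECLARE_REGISTRY(FooRegistry, Foo, int);
  // In exactly one .cc file: define the registry.
  C10_DEFINE_REGISTRY(FooRegistry, Foo, int);

  // Register a subclass under the key "Bar".
  class Bar : public Foo {
   public:
    explicit Bar(int x) : Foo(x) {}
  };
  C10_REGISTER_CLASS(FooRegistry, Bar, Bar);

  // Create an instance by key; Create() returns nullptr for unknown keys.
  std::unique_ptr<Foo> bar = FooRegistry()->Create("Bar", 1);
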
diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index b40a8fa33778a7..ce5fdbe7b7ed80 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -408,14 +408,9 @@ void AsyncNetBase::finalizeEvents() { AsyncNetBase::~AsyncNetBase() {} -CAFFE_DEFINE_SHARED_REGISTRY( - ThreadPoolRegistry, - TaskThreadPool, - int, - int, - bool); - -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); +C10_DEFINE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); + +C10_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); /* static */ std::shared_ptr diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 502233e7f045b4..29be2c04daccfc 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -1,11 +1,11 @@ #ifndef CAFFE2_CORE_NET_ASYNC_BASE_H_ #define CAFFE2_CORE_NET_ASYNC_BASE_H_ +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/net_async_base.h" #include "caffe2/core/net_dag_utils.h" -#include "caffe2/core/registry.h" #include "caffe2/core/stats.h" #include "caffe2/core/timer.h" #include "caffe2/core/workspace.h" @@ -139,12 +139,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { friend class tracing::Tracer; }; -CAFFE_DECLARE_SHARED_REGISTRY( - ThreadPoolRegistry, - TaskThreadPool, - int, - int, - bool); +C10_DECLARE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); class AsyncNetExecutorHelper : public ExecutorHelper { public: diff --git a/caffe2/core/net_async_gpu_thread_pool_gpu.cc b/caffe2/core/net_async_gpu_thread_pool_gpu.cc index ca3f691bc49764..dc0bf118ab7956 100644 --- a/caffe2/core/net_async_gpu_thread_pool_gpu.cc +++ b/caffe2/core/net_async_gpu_thread_pool_gpu.cc @@ -6,7 +6,7 @@ CAFFE2_DEFINE_int(caffe2_threads_per_gpu, 1, "Number of CPU threads per GPU"); namespace caffe2 { -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool); std::shared_ptr GetAsyncNetGPUThreadPool(int gpu_id, int pool_size, bool create_new) { diff --git a/caffe2/core/net_dag.h b/caffe2/core/net_dag.h index ab3ce0f6f3fa10..7c66217a23ec4d 100644 --- a/caffe2/core/net_dag.h +++ b/caffe2/core/net_dag.h @@ -9,6 +9,7 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" @@ -16,7 +17,6 @@ #include "caffe2/core/net_dag_utils.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/stats.h" #include "caffe2/core/tensor.h" #include "caffe2/core/timer.h" diff --git a/caffe2/core/net_dag_utils.h b/caffe2/core/net_dag_utils.h index 6debfbf7bd8053..0259f10f954652 100644 --- a/caffe2/core/net_dag_utils.h +++ b/caffe2/core/net_dag_utils.h @@ -10,13 +10,13 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index c114fd8d224f21..5b8bc29be4dfae 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -3,10 +3,10 @@ #include +#include 
"c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/net_simple_async.h b/caffe2/core/net_simple_async.h index ea5aae959870f6..abe16f2013789f 100644 --- a/caffe2/core/net_simple_async.h +++ b/caffe2/core/net_simple_async.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index fa8aee6d818366..b21246a6611789 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -1,11 +1,11 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/net_dag.h" #include "caffe2/core/net_simple.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator.h" -#include "caffe2/core/registry.h" #include "caffe2/core/scope_guard.h" namespace caffe2 { diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 5f3f653b5a4b21..79be08c03b2325 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -316,31 +316,32 @@ std::map* gDeviceTypeRegistry() { return &g_device_type_registry; } -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( CPUOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(CPU, CPUOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(CUDA, CUDAOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( HIPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(HIP, HIPOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( GradientRegistry, GradientMakerBase, - const OperatorDef&, const vector&); + const OperatorDef&, + const vector&); GradientOpsMeta GetGradientForOp( const OperatorDef& def, const vector& g_output) { diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 1a968c4c3755fe..8208eb271bdc1b 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -9,13 +9,13 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_gradient.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/core/workspace.h" @@ -778,13 +778,13 @@ CAFFE2_DEFINE_TENSOR_TYPES_DISPATCHER( // registry function. // (2) Then, one can call the operator registry function to further create the // operators. 
-typedef Registry< +typedef c10::Registry< std::string, std::unique_ptr, const OperatorDef&, Workspace*> OperatorRegistry; -typedef Registry< +typedef c10::Registry< std::string, std::unique_ptr, const OperatorDef&, @@ -806,7 +806,7 @@ struct CAFFE2_API DeviceTypeRegisterer { #define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \ namespace { \ - static DeviceTypeRegisterer CAFFE_ANONYMOUS_VARIABLE( \ + static DeviceTypeRegisterer C10_ANONYMOUS_VARIABLE( \ DeviceType)(type, ®istry_function); \ } @@ -817,69 +817,67 @@ struct CAFFE2_API DeviceTypeRegisterer { // not depend on specific cuda or cudnn libraries. This means that we will be // able to compile it even when there is no cuda available - we simply do not // link any cuda or cudnn operators. -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( CPUOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CPU_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) #define REGISTER_CPU_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_CPU_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS( \ - CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) // Macros for cudnn since we use it often #define REGISTER_CUDNN_OPERATOR(name, ...) \ REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__) // Macros for HIP operators -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( HIPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_HIP_OPERATOR(name, ...) 
\ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS( \ - HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) #define REGISTER_MIOPEN_OPERATOR(name, ...) \ REGISTER_HIP_OPERATOR_WITH_ENGINE(name, MIOPEN, __VA_ARGS__) diff --git a/caffe2/core/operator_c10wrapper.cc b/caffe2/core/operator_c10wrapper.cc index 6fd62ec1cf63b4..523c467b170d58 100644 --- a/caffe2/core/operator_c10wrapper.cc +++ b/caffe2/core/operator_c10wrapper.cc @@ -2,7 +2,7 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( C10OperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/core/operator_c10wrapper.h b/caffe2/core/operator_c10wrapper.h index 695319266901a8..57a3c370ba5e32 100644 --- a/caffe2/core/operator_c10wrapper.h +++ b/caffe2/core/operator_c10wrapper.h @@ -284,7 +284,7 @@ struct ParameterHelper final { } }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( C10OperatorRegistry, OperatorBase, const OperatorDef&, @@ -293,14 +293,14 @@ CAFFE_DECLARE_REGISTRY( // TODO Currently we only register the CPU variant. This is going to be fixed // once the tensor detemplatization lands. #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OpSchemaDef, State, Name) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper>) #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_PARAMETERS( \ OpSchemaDef, State, Name, ...) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper< \ @@ -312,14 +312,14 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_ARRAY_INPUT( \ OpSchemaDef, State, Name) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper>) #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_ARRAY_INPUT_AND_PARAMETERS( \ OpSchemaDef, State, Name, ...) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper< \ diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h index 3eea164c21b840..2eb5b581092c30 100644 --- a/caffe2/core/operator_gradient.h +++ b/caffe2/core/operator_gradient.h @@ -1,8 +1,8 @@ #ifndef CAFFE2_CORE_OPERATOR_GRADIENT_H_ #define CAFFE2_CORE_OPERATOR_GRADIENT_H_ +#include "c10/util/Registry.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/proto_utils.h" @@ -295,16 +295,16 @@ struct GradientNotImplementedYet : public GradientMakerBase { } }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( GradientRegistry, GradientMakerBase, const OperatorDef&, const vector&); #define REGISTER_GRADIENT(name, ...) \ - CAFFE_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__) #define REGISTER_GRADIENT_STR(str_name, ...) 
\ - CAFFE_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__) // NO_GRADIENT means that the operator does not need any gradient computation. #define NO_GRADIENT(name) REGISTER_GRADIENT(name, NoGradient) diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 54a6a17b8a0d24..a938d8f56afc93 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -9,9 +9,9 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/filler.h" @@ -578,14 +578,14 @@ OpSchema::Cost PointwiseCostInference( #define OPERATOR_SCHEMA(name) \ C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ + static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #else // CAFFE2_NO_OPERATOR_SCHEMA #define OPERATOR_SCHEMA(name) \ C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ + static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ 1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #endif // CAFFE2_NO_OPERATOR_SCHEMA diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h deleted file mode 100644 index f026795b23c3e1..00000000000000 --- a/caffe2/core/registry.h +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Simple registry implementation in Caffe2 that uses static variables to - * register object creators during program initialization time. - * - * WARNING: this registry is not entirely thread-safe, as reads to - * the registry are not protected by a mutex. The safest mode of use - * is to dlopen() *all* dynamic libraries that may write to the library - * and synchronize prior to performing any reads on the registry. - */ -#ifndef CAFFE2_CORE_REGISTRY_H_ -#define CAFFE2_CORE_REGISTRY_H_ - -#include -#include -#include -#include -#include -#include - -#include - -#include "caffe2/core/common.h" -#include "caffe2/core/typeid.h" - -namespace caffe2 { - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. - */ -template -class Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. 
- std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - at::PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a vector. - */ - vector Keys() { - vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const CaffeMap& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - CaffeMap registry_; - CaffeMap help_message_; - std::mutex register_mutex_; - - C10_DISABLE_COPY_AND_ASSIGN(Registry); -}; - -template -class Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * CAFFE_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define CAFFE_CONCATENATE_IMPL(s1, s2) s1##s2 -#define CAFFE_CONCATENATE(s1, s2) CAFFE_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define CAFFE_ANONYMOUS_VARIABLE(str) CAFFE_CONCATENATE(str, __COUNTER__) -#else -#define CAFFE_ANONYMOUS_VARIABLE(str) CAFFE_CONCATENATE(str, __LINE__) -#endif - -/** - * CAFFE_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - C10_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, ##__VA_ARGS__> \ - Registerer##RegistryName; - -#define CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - C10_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName() { \ - static Registry, ##__VA_ARGS__>* registry = \ - new Registry, ##__VA_ARGS__>(); \ - return registry; \ - } - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define CAFFE_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - static Registerer##RegistryName CAFFE_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define CAFFE_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ - namespace { \ - static Registerer##RegistryName CAFFE_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - at::demangle_type<__VA_ARGS__>()); \ - } - -// CAFFE_DECLARE_REGISTRY and CAFFE_DEFINE_REGISTRY are hard-wired to use string -// as the key -// type, because that is the most commonly used cases. -#define CAFFE_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) - -#define CAFFE_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) - -#define CAFFE_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) - -#define CAFFE_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) - -// CAFFE_REGISTER_CREATOR and CAFFE_REGISTER_CLASS are hard-wired to use string -// as the key -// type, because that is the most commonly used cases. -#define CAFFE_REGISTER_CREATOR(RegistryName, key, ...) \ - CAFFE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define CAFFE_REGISTER_CLASS(RegistryName, key, ...) \ - CAFFE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace caffe2 -#endif // CAFFE2_CORE_REGISTRY_H_ diff --git a/caffe2/core/registry_test.cc b/caffe2/core/registry_test.cc deleted file mode 100644 index 7ad8ead553463a..00000000000000 --- a/caffe2/core/registry_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include - -#include "caffe2/core/registry.h" -#include -#include "caffe2/core/logging.h" - -namespace caffe2 { -namespace { - -class Foo { - public: - explicit Foo(int x) { LOG(INFO) << "Foo " << x; } -}; - -CAFFE_DECLARE_REGISTRY(FooRegistry, Foo, int); -CAFFE_DEFINE_REGISTRY(FooRegistry, Foo, int); -#define REGISTER_FOO(clsname) \ - CAFFE_REGISTER_CLASS(FooRegistry, clsname, clsname) - -class Bar : public Foo { - public: - explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; } -}; -REGISTER_FOO(Bar); - -class AnotherBar : public Foo { - public: - explicit AnotherBar(int x) : Foo(x) { - LOG(INFO) << "AnotherBar " << x; - } -}; -REGISTER_FOO(AnotherBar); - -TEST(RegistryTest, CanRunCreator) { - unique_ptr bar(FooRegistry()->Create("Bar", 1)); - EXPECT_TRUE(bar != nullptr) << "Cannot create bar."; - unique_ptr another_bar(FooRegistry()->Create("AnotherBar", 1)); - EXPECT_TRUE(another_bar != nullptr); -} - -TEST(RegistryTest, ReturnNullOnNonExistingCreator) { - EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr); -} -} -} // namespace caffe2 diff --git a/caffe2/core/transform.cc b/caffe2/core/transform.cc index 5b3f80fbe3fc0a..549322abccc7da 100644 --- a/caffe2/core/transform.cc +++ b/caffe2/core/transform.cc @@ -10,7 +10,7 @@ namespace caffe2 { using transform::Graph; -CAFFE_DEFINE_REGISTRY(TransformRegistry, Transform); +C10_DEFINE_REGISTRY(TransformRegistry, Transform); std::vector> Transform::PatternMatch(const Graph& graph) { // checks if the node at index i is matched already or not diff --git a/caffe2/core/transform.h b/caffe2/core/transform.h index c6aaf119513847..723e14789d627c 100644 --- a/caffe2/core/transform.h +++ b/caffe2/core/transform.h @@ -150,9 +150,9 @@ class CAFFE2_API Transform { // Creates a Transform based on a key, which should 
be defined in registry. CAFFE2_API unique_ptr CreateTransform(string key); -CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); +C10_DECLARE_REGISTRY(TransformRegistry, Transform); #define REGISTER_TRANSFORM(name, ...) \ - CAFFE_REGISTER_CLASS(TransformRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(TransformRegistry, name, __VA_ARGS__) // Create a Transform object from registry, // and immediately apply it to a Netdef. diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index 2ad486c328f56d..324766359de607 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -11,8 +11,8 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" -#include "caffe2/core/registry.h" #include "caffe2/core/net.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/signal_handler.h" diff --git a/caffe2/ideep/utils/ideep_operator.h b/caffe2/ideep/utils/ideep_operator.h index 5cccbb509725c2..89e0691c29960a 100644 --- a/caffe2/ideep/utils/ideep_operator.h +++ b/caffe2/ideep/utils/ideep_operator.h @@ -6,21 +6,21 @@ namespace caffe2 { -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( IDEEPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_IDEEP_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR(name, ...) \ - CAFFE_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) // IDEEPOperator is the base scaffolding of the operators that uses IDEEP. It // provides a few operators that are useful to IDEEP specific implementations. diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 020e22fa6143ed..9aa3f195118039 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -8,7 +8,7 @@ namespace caffe2 { CAFFE_KNOWN_TYPE(ideep::tensor); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( IDEEPOperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/mkl/mkl_operator.cc b/caffe2/mkl/mkl_operator.cc index bf5b460d0920be..8fba56da8474d6 100644 --- a/caffe2/mkl/mkl_operator.cc +++ b/caffe2/mkl/mkl_operator.cc @@ -9,7 +9,7 @@ CAFFE2_DEFINE_bool( namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( MKLOperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/mkl/utils/mkl_operator.h b/caffe2/mkl/utils/mkl_operator.h index 2236e9267af542..0f028fbfaa8c01 100644 --- a/caffe2/mkl/utils/mkl_operator.h +++ b/caffe2/mkl/utils/mkl_operator.h @@ -10,20 +10,20 @@ CAFFE2_DECLARE_bool(caffe2_mkl_memonger_in_use); namespace caffe2 { -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( MKLOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_MKL_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(MKLOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(MKLOperatorRegistry, key, __VA_ARGS__) #define REGISTER_MKL_OPERATOR(name, ...) 
\ - CAFFE_REGISTER_CLASS(MKLOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(MKLOperatorRegistry, name, __VA_ARGS__) #define REGISTER_MKL_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(MKLOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(MKLOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_MKL_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(MKLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(MKLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) namespace mkl { // MKLOperator is the base scaffolding of the operators that uses MKLDNN. It diff --git a/caffe2/mobile/contrib/arm-compute/core/net_gl.h b/caffe2/mobile/contrib/arm-compute/core/net_gl.h index 1dc93dedc3fff3..48a47ff87f3351 100644 --- a/caffe2/mobile/contrib/arm-compute/core/net_gl.h +++ b/caffe2/mobile/contrib/arm-compute/core/net_gl.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/mobile/contrib/arm-compute/core/operator.cc b/caffe2/mobile/contrib/arm-compute/core/operator.cc index bd4337aa85e7cf..cddd0b0129c6a0 100644 --- a/caffe2/mobile/contrib/arm-compute/core/operator.cc +++ b/caffe2/mobile/contrib/arm-compute/core/operator.cc @@ -2,8 +2,11 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY(GLOperatorRegistry, OperatorBase, const OperatorDef &, - Workspace *); +C10_DEFINE_REGISTRY( + GLOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); CAFFE_REGISTER_DEVICE_TYPE(DeviceType::OPENGL, GLOperatorRegistry); } // namespace caffe2 diff --git a/caffe2/mobile/contrib/arm-compute/core/operator.h b/caffe2/mobile/contrib/arm-compute/core/operator.h index 037173054f7715..4df78c7734b849 100644 --- a/caffe2/mobile/contrib/arm-compute/core/operator.h +++ b/caffe2/mobile/contrib/arm-compute/core/operator.h @@ -1,26 +1,29 @@ #ifndef CAFFE2_OPENGL_OPERATOR_H_ #define CAFFE2_OPENGL_OPERATOR_H_ +#include "c10/util/Registry.h" #include "caffe2/core/operator.h" -#include "caffe2/core/registry.h" namespace caffe2 { -CAFFE_DECLARE_REGISTRY(GLOperatorRegistry, OperatorBase, const OperatorDef &, - Workspace *); -#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__) -#define REGISTER_GL_OPERATOR(name, ...) \ - extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \ - CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - } \ - CAFFE_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__) -#define REGISTER_GL_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__) +C10_DECLARE_REGISTRY( + GLOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); +#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \ + C10_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__) +#define REGISTER_GL_OPERATOR(name, ...) \ + extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \ + CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + } \ + C10_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__) +#define REGISTER_GL_OPERATOR_STR(str_name, ...) 
\ + C10_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__) -#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) +#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \ + C10_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) } // namespace caffe2 diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc index b6966fe89c76fc..f70149110378fc 100644 --- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc +++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc @@ -1,5 +1,5 @@ #include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h" -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { REGISTER_CPU_OPERATOR( diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.cc b/caffe2/operators/fused_rowwise_random_quantization_ops.cc index ca5d8f25d3a9f2..9dec789393d993 100644 --- a/caffe2/operators/fused_rowwise_random_quantization_ops.cc +++ b/caffe2/operators/fused_rowwise_random_quantization_ops.cc @@ -1,5 +1,5 @@ #include "caffe2/operators/fused_rowwise_random_quantization_ops.h" -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" #include "caffe2/utils/math.h" namespace caffe2 { diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc index 6dc47d7781d131..513ac64e795c41 100644 --- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc +++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc @@ -1,5 +1,5 @@ #include "caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h" -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc index bfa1a666e6ed9d..5ecfceef5dc612 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc @@ -1,5 +1,5 @@ #include "caffe2/operators/lengths_reducer_rowwise_8bit_ops.h" -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index f9956060b75cfd..6a7520716032bc 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -56,7 +56,7 @@ int getGroup(std::map& argMap) { namespace caffe2 { -CAFFE_DEFINE_REGISTRY(ConverterRegistry, Converter); +C10_DEFINE_REGISTRY(ConverterRegistry, Converter); std::map Converter::getArgumentsFromOperator( caffe2::OperatorDef op) { diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index c106fc66057916..f5933313c739ed 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -44,9 +44,9 @@ class CAFFE2_API Converter { virtual ~Converter() {} }; -CAFFE_DECLARE_REGISTRY(ConverterRegistry, Converter); +C10_DECLARE_REGISTRY(ConverterRegistry, Converter); #define REGISTER_CONVERTER(name, cls) \ - CAFFE_REGISTER_CLASS(ConverterRegistry, name, cls) + C10_REGISTER_CLASS(ConverterRegistry, name, cls) #define TRIVIAL_CONVERTER(opName) \ class opName##Converter : public Converter { \ diff --git a/caffe2/opt/passes.cc b/caffe2/opt/passes.cc index e9f05a9df01c79..74250d1bbb3b12 100644 --- a/caffe2/opt/passes.cc +++ b/caffe2/opt/passes.cc @@ -2,7 +2,11 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, 
NNModule*, Workspace*); -CAFFE_DEFINE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); +C10_DEFINE_REGISTRY( + WorkspaceOptimizationPassRegistry, + WorkspaceOptimizationPass, + NNModule*, + Workspace*); +C10_DEFINE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); } // namespace caffe2 diff --git a/caffe2/opt/passes.h b/caffe2/opt/passes.h index 056dbcf8779b3b..fc15dcad13fe7b 100644 --- a/caffe2/opt/passes.h +++ b/caffe2/opt/passes.h @@ -40,9 +40,13 @@ class CAFFE2_API WorkspaceOptimizationPass : public OptimizationPass { Workspace* ws_; }; -CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*); +C10_DECLARE_REGISTRY( + WorkspaceOptimizationPassRegistry, + WorkspaceOptimizationPass, + NNModule*, + Workspace*); #define REGISTER_WS_OPT_PASS(clsname) \ - CAFFE_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname) + C10_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname) #define REGISTER_WS_OPT_PASS_FROM_FUNC(passname, funcname) \ class passname : public WorkspaceOptimizationPass { \ public: \ @@ -53,9 +57,9 @@ CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationP }; \ REGISTER_WS_OPT_PASS(passname); -CAFFE_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); +C10_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); #define REGISTER_OPT_PASS(clsname) \ - CAFFE_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname) + C10_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname) #define REGISTER_OPT_PASS_FROM_FUNC(passname, funcname) \ class passname : public OptimizationPass { \ public: \ diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 9a1d715bfdf225..7062ead045df1c 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -53,12 +53,12 @@ static std::string gCurrentWorkspaceName; BlobFetcherBase::~BlobFetcherBase() {} BlobFeederBase::~BlobFeederBase() {} -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, std::unique_ptr); -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobFeederRegistry, caffe2::DeviceType, BlobFeederBase, diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 4f81569e429369..d18d728f282afa 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -60,24 +60,24 @@ class BlobFeederBase { Feed(const DeviceOption& option, PyArrayObject* array, Blob* blob) = 0; }; -C10_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, std::unique_ptr); #define REGISTER_BLOB_FETCHER(id, ...) \ - CAFFE_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__) inline unique_ptr CreateFetcher(TypeIdentifier id) { return BlobFetcherRegistry()->Create(id); } -CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobFeederRegistry, DeviceType, BlobFeederBase, std::unique_ptr); #define REGISTER_BLOB_FEEDER(device_type, ...) 
\
-  CAFFE_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__)
+  C10_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__)
 inline unique_ptr<BlobFeederBase> CreateFeeder(int device_type) {
   return BlobFeederRegistry()->Create(
       caffe2::ProtoToType(static_cast(device_type)));
diff --git a/caffe2/python/pybind_state_registry.cc b/caffe2/python/pybind_state_registry.cc
index 9dfb87731ff4de..77fabf34256480 100644
--- a/caffe2/python/pybind_state_registry.cc
+++ b/caffe2/python/pybind_state_registry.cc
@@ -5,7 +5,7 @@ namespace python {

 namespace py = pybind11;

-CAFFE_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&);
+C10_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&);

 } // namespace python
 } // namespace caffe2
diff --git a/caffe2/python/pybind_state_registry.h b/caffe2/python/pybind_state_registry.h
index a107e7db8ea0ad..18bb0a3dbaa01d 100644
--- a/caffe2/python/pybind_state_registry.h
+++ b/caffe2/python/pybind_state_registry.h
@@ -1,7 +1,7 @@
 #pragma once

 #include
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"

 namespace caffe2 {
 namespace python {
@@ -14,19 +14,16 @@ struct PybindAddition {
   virtual ~PybindAddition(){};
 };

-CAFFE_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&);
+C10_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&);

-#define REGISTER_PYBIND_ADDITION(funcname) \
-  namespace { \
-  struct funcname##Impl : public PybindAddition { \
-    funcname##Impl(py::module& m) { \
-      funcname(m); \
-    } \
-  }; \
-  CAFFE_REGISTER_CLASS( \
-      PybindAdditionRegistry, \
-      funcname##Impl, \
-      funcname##Impl); \
+#define REGISTER_PYBIND_ADDITION(funcname) \
+  namespace { \
+  struct funcname##Impl : public PybindAddition { \
+    funcname##Impl(py::module& m) { \
+      funcname(m); \
+    } \
+  }; \
+  C10_REGISTER_CLASS(PybindAdditionRegistry, funcname##Impl, funcname##Impl); \
   }

 } // namespace python
diff --git a/setup.py b/setup.py
index 94455ed1cf7be7..2ea6871d593186 100644
--- a/setup.py
+++ b/setup.py
@@ -1202,6 +1202,7 @@ def make_relative_rpath(path):
                 'lib/include/caffe2/utils/*.h',
                 'lib/include/c10/*.h',
                 'lib/include/c10/macros/*.h',
+                'lib/include/c10/util/*.h',
                 'lib/include/torch/*.h',
                 'lib/include/torch/csrc/*.h',
                 'lib/include/torch/csrc/api/include/torch/*.h',

From a72603f8f8ffdd77a826129b18e839df3fe3a109 Mon Sep 17 00:00:00 2001
From: Freddie Mendoza
Date: Thu, 27 Sep 2018 07:05:19 -0700
Subject: [PATCH 14/82] Fix for ppc64le jit graph difference in sigmoid backward, see #10726 (#11579)

Summary:
As reported in Issue #10726, the jit compiler, when running on ppc64le, may produce an isomorphic output but fail a diff test against the expected output file. The expected output file is created from a test that was run on x86_64. This change ensures that if the ppc64le test output differs, it is instead compared to an expected output file created when the test is run on a ppc64le system.
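Note: the autodiff formula involved is the sigmoid derivative. For y = sigmoid(x) = 1 / (1 + exp(-x)), we have dy/dx = y * (1 - y), so the backward pass computes

  grad_input = grad_output * y * (1 - y)

Reassociating this product, as the change below does, computes the same value (the factors merely commute), but it changes the order in which the JIT emits aten::mul nodes, and therefore the textual graph dump that the expect files compare against.
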
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11579 Differential Revision: D10080890 Pulled By: soumith fbshipit-source-id: 7249bf6b5dfa7c853368a3688a982bc9ed642bc9 --- .../TestScript.test_lstm_fusion_cuda-backward.expect | 12 ++++++------ ...estScript.test_milstm_fusion_cuda-backward.expect | 12 ++++++------ torch/csrc/jit/autodiff.cpp | 5 ++++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect index cbdbc744b5e85d..3674a3fbc07d2b 100644 --- a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect @@ -56,8 +56,8 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %23 : Float(*, *) = aten::neg(%3) %24 : int = prim::Constant[value=1]() %25 : Float(*, *) = aten::add(%23, %24, %24) - %26 : Float(*, *) = aten::mul(%19, %3) - %27 : Float(*, *) = aten::mul(%26, %25) + %26 : Float(*, *) = aten::mul(%25, %3) + %27 : Float(*, *) = aten::mul(%26, %19) %28 : Float(*, *) = aten::mul(%2, %2) %29 : Float(*, *) = aten::neg(%28) %30 : int = prim::Constant[value=1]() @@ -66,13 +66,13 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %33 : Float(*, *) = aten::neg(%1) %34 : int = prim::Constant[value=1]() %35 : Float(*, *) = aten::add(%33, %34, %34) - %36 : Float(*, *) = aten::mul(%22, %1) - %37 : Float(*, *) = aten::mul(%36, %35) + %36 : Float(*, *) = aten::mul(%35, %1) + %37 : Float(*, *) = aten::mul(%36, %22) %38 : Float(*, *) = aten::neg(%0) %39 : int = prim::Constant[value=1]() %40 : Float(*, *) = aten::add(%38, %39, %39) - %41 : Float(*, *) = aten::mul(%20, %0) - %42 : Float(*, *) = aten::mul(%41, %40) + %41 : Float(*, *) = aten::mul(%40, %0) + %42 : Float(*, *) = aten::mul(%41, %20) %43 : Float(*, *) = prim::FusedConcat[dim=1](%42, %37, %32, %27) return (%43, %18); } diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index b0dc85644751d8..fb14a35296623a 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -62,8 +62,8 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %20 : Float(*, *) = aten::neg(%3) %21 : int = prim::Constant[value=1]() %22 : Float(*, *) = aten::add(%20, %21, %21) - %23 : Float(*, *) = aten::mul(%8, %3) - %24 : Float(*, *) = aten::mul(%23, %22) + %23 : Float(*, *) = aten::mul(%22, %3) + %24 : Float(*, *) = aten::mul(%23, %8) %25 : Float(*, *) = aten::mul(%2, %2) %26 : Float(*, *) = aten::neg(%25) %27 : int = prim::Constant[value=1]() @@ -72,13 +72,13 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %30 : Float(*, *) = aten::neg(%1) %31 : int = prim::Constant[value=1]() %32 : Float(*, *) = aten::add(%30, %31, %31) - %33 : Float(*, *) = aten::mul(%19, %1) - %34 : Float(*, *) = aten::mul(%33, %32) + %33 : Float(*, *) = aten::mul(%32, %1) + %34 : Float(*, *) = aten::mul(%33, %19) %35 : Float(*, *) = aten::neg(%0) %36 : int = prim::Constant[value=1]() %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%17, %0) - %39 : Float(*, *) = aten::mul(%38, %37) + %38 : Float(*, *) = aten::mul(%37, %0) + %39 : Float(*, *) = aten::mul(%38, %17) %40 : Float(*, *) = prim::FusedConcat[dim=1](%39, %34, %29, %24) return (%40); } diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 009bf68ae3f6da..80e196c1b8ac59 100644 --- 
a/torch/csrc/jit/autodiff.cpp
+++ b/torch/csrc/jit/autodiff.cpp
@@ -166,7 +166,10 @@ static std::vector<Value*> gradientForNode(Node* node, ArrayRef<Value*> grad_val
     return {nullptr, -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))};

   } else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) {
-    return {grads.at(0) * outputs.at(0) * (1 - outputs.at(0))};
+    // TODO: The order of operations matters in this case. This
+    // works for ppc64le and x86_64. Need to look at why the
+    // order matters.
+    return {(1 - outputs.at(0)) * outputs.at(0) * grads.at(0)};

   } else if (node->matches("aten::tanh(Tensor self) -> Tensor")) {
     return {grads.at(0) * (1 - outputs.at(0) * outputs.at(0))};

From 13cf39294d7d4d6d3bff777e5e20def7b4b40377 Mon Sep 17 00:00:00 2001
From: Yangqing Jia
Date: Thu, 27 Sep 2018 09:58:23 -0700
Subject: [PATCH 15/82] Remove ATen/Error.h and use ATen/core/Error.h instead. (#12132)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12132

TSIA. No code change involved.

Reviewed By: bwasti

Differential Revision: D10083237

fbshipit-source-id: bdab029015b9d0f1fa1f866c68aa5945cc68db9d
---
 aten/src/ATen/detail/ComplexHooksInterface.h   | 2 +-
 aten/src/ATen/native/PixelShuffle.cpp          | 2 +-
 torch/csrc/api/src/serialize/input-archive.cpp | 2 +-
 torch/csrc/autograd/engine.cpp                 | 2 +-
 torch/csrc/jit/pybind_utils.h                  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/detail/ComplexHooksInterface.h b/aten/src/ATen/detail/ComplexHooksInterface.h
index ec2995a498dc19..52f835a30cc17b 100644
--- a/aten/src/ATen/detail/ComplexHooksInterface.h
+++ b/aten/src/ATen/detail/ComplexHooksInterface.h
@@ -1,6 +1,6 @@
 #pragma once

-#include <ATen/Error.h>
+#include <ATen/core/Error.h>
 #include "c10/util/Registry.h"

 namespace at {
diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp
index 1f93ecbc8235ab..d16458e5ad80a6 100644
--- a/aten/src/ATen/native/PixelShuffle.cpp
+++ b/aten/src/ATen/native/PixelShuffle.cpp
@@ -1,7 +1,7 @@
 #include "ATen/native/TensorTransformations.h"

 #include
-#include <ATen/Error.h>
+#include <ATen/core/Error.h>
 #include
 #include
diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp
index bd6995d67d69e9..11e97bce08f564 100644
--- a/torch/csrc/api/src/serialize/input-archive.cpp
+++ b/torch/csrc/api/src/serialize/input-archive.cpp
@@ -6,7 +6,7 @@
 #include
 #include

-#include <ATen/Error.h>
+#include <ATen/core/Error.h>
 #include
 #include
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 1847bb65b08f8a..a5edc29833633a 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -8,7 +8,7 @@
 #include
 #include

-#include <ATen/Error.h>
+#include <ATen/core/Error.h>
 #include
 #include
diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h
index c78b268cb2da75..5a012970f2c4f6 100644
--- a/torch/csrc/jit/pybind_utils.h
+++ b/torch/csrc/jit/pybind_utils.h
@@ -9,7 +9,7 @@
 #include "torch/csrc/utils/pybind.h"
 #include "torch/csrc/utils/auto_gil.h"

-#include <ATen/Error.h>
+#include <ATen/core/Error.h>
 #include
 #include

From 6e7e63fda3a4ee1254a17480692b93148954b8b9 Mon Sep 17 00:00:00 2001
From: "Cheng,Penghui"
Date: Thu, 27 Sep 2018 13:37:02 -0700
Subject: [PATCH 16/82] Implement MomentumSGD/MomentumSGDUpdate operators for mkl-dnn (#11686)

Summary:
The speed-up of a single operation is up to 6X on BDW (Broadwell).
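For reference, the update rule implemented below (lr is the learning rate, mu the momentum coefficient, g the gradient, m the momentum buffer) is:

  standard: m' = lr * g + mu * m;  g' = m'
  nesterov: m' = mu * m + lr * g;  g' = (1 + mu) * m' - mu * m

MomentumSGDUpdate additionally applies param -= g' in the same pass.
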
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11686 Reviewed By: yinghai Differential Revision: D9828129 Pulled By: wesolwsk fbshipit-source-id: 7dbacea90609e18438f6fe1229c641937d0696c8 --- caffe2/ideep/operators/momentum_sgd_op.cc | 125 ++++++++++++++++++++++ caffe2/python/ideep/moment_sgd_op_test.py | 61 +++++++++++ 2 files changed, 186 insertions(+) create mode 100644 caffe2/ideep/operators/momentum_sgd_op.cc create mode 100644 caffe2/python/ideep/moment_sgd_op_test.py diff --git a/caffe2/ideep/operators/momentum_sgd_op.cc b/caffe2/ideep/operators/momentum_sgd_op.cc new file mode 100644 index 00000000000000..320780c12ffe1d --- /dev/null +++ b/caffe2/ideep/operators/momentum_sgd_op.cc @@ -0,0 +1,125 @@ +#include + +namespace caffe2 { + +void momentum_sgd_update( + const int N, + const float* g, + const float* m, + float* ng, + float* nm, + const float* lr, + const float momentum, + const bool nesterov, + float* param) { + const float LR = lr[0]; +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (auto i = 0; i < N; ++i) { + if (!nesterov) { + const float adjusted_gradient = LR * g[i] + momentum * m[i]; + nm[i] = adjusted_gradient; + ng[i] = adjusted_gradient; + } else { + const float mi = m[i]; + const float mi_new = momentum * mi + LR * g[i]; + nm[i] = mi_new; + ng[i] = (1 + momentum) * mi_new - momentum * mi; + } + + if (param) { + param[i] -= ng[i]; + } + } +} + +class IDEEPMomentumSGDOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + + IDEEPMomentumSGDOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + momentum_(OperatorBase::GetSingleArgument("momentum", 0.0)), + nesterov_(OperatorBase::GetSingleArgument("nesterov", 0)) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems()); + if (Input(GRAD) != *Output(OUTPUT_GRAD)) { + Output(OUTPUT_GRAD)->reinit(Input(GRAD).get_descriptor()); + } + if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) { + Output(OUTPUT_MOMENTUM)->reinit(Input(MOMENTUM).get_descriptor()); + } + + // TODO: Use itensor after 0-dim is supported. Now use CPU tensor. + const auto& lr = OperatorBase::Input(LR, CPU); + CAFFE_ENFORCE(lr.size() == 1); + + momentum_sgd_update( + Input(GRAD).get_nelems(), + static_cast(Input(GRAD).get_data_handle()), + static_cast(Input(MOMENTUM).get_data_handle()), + static_cast(Output(OUTPUT_GRAD)->get_data_handle()), + static_cast(Output(OUTPUT_MOMENTUM)->get_data_handle()), + lr.template data(), + momentum_, + nesterov_, + nullptr); + return true; + } + + protected: + float momentum_{0.9}; + bool nesterov_; + INPUT_TAGS(GRAD, MOMENTUM, LR); + OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM); +}; + +class IDEEPMomentumSGDUpdateOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + IDEEPMomentumSGDUpdateOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + momentum_(OperatorBase::GetSingleArgument("momentum", 0.0)), + nesterov_(OperatorBase::GetSingleArgument("nesterov", 0)) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems()); + if (Input(GRAD) != *Output(OUTPUT_GRAD)) { + Output(OUTPUT_GRAD)->reinit(Input(GRAD).get_descriptor()); + } + if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) { + Output(OUTPUT_MOMENTUM)->reinit(Input(MOMENTUM).get_descriptor()); + } + + // TODO: Use itensor after 0-dim is supported. 
Now use CPU tensor. + const auto& lr = OperatorBase::Input(LR, CPU); + CAFFE_ENFORCE(lr.size() == 1); + + momentum_sgd_update( + Input(GRAD).get_nelems(), + static_cast(Input(GRAD).get_data_handle()), + static_cast(Input(MOMENTUM).get_data_handle()), + static_cast(Output(OUTPUT_GRAD)->get_data_handle()), + static_cast(Output(OUTPUT_MOMENTUM)->get_data_handle()), + lr.template data(), + momentum_, + nesterov_, + static_cast(Output(OUTPUT_PARAM)->get_data_handle())); + return true; + } + + protected: + float momentum_{0.9}; + bool nesterov_; + INPUT_TAGS(GRAD, MOMENTUM, LR, PARAM); + OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM, OUTPUT_PARAM); +}; + +REGISTER_IDEEP_OPERATOR(MomentumSGD, IDEEPMomentumSGDOp); +REGISTER_IDEEP_OPERATOR(MomentumSGDUpdate, IDEEPMomentumSGDUpdateOp); + +} // namespace caffe2 diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py new file mode 100644 index 00000000000000..90b49a8600d76c --- /dev/null +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import hypothesis.strategies as st +import unittest +import caffe2.python.hypothesis_test_util as hu +from caffe2.python import core, workspace +from hypothesis import given +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestMomentumSGDUpdateOps(hu.HypothesisTestCase): + @given(n=st.integers(4, 8), nesterov=st.booleans(), + **mu.gcs) + def test_MomentumSGDUpdate(self, n, nesterov, gc, dc): + param = np.random.rand(n).astype(np.float32) + grad = np.random.rand(n).astype(np.float32) + lr = np.random.rand(1).astype(np.float32) + param_momentum = np.random.rand(n).astype(np.float32) + momentum = 0.9 + op = core.CreateOperator( + "MomentumSGDUpdate", + ["grad", "param_momentum", "lr", "param"], + ["grad", "param_momentum", "param"], + momentum=momentum, + nesterov=int(nesterov), + ) + # Iter lives on the CPU + input_device_options = {'lr': hu.cpu_do} + + self.assertDeviceChecks( + dc, + op, + [grad, param_momentum, lr, param], + [0], + input_device_options=input_device_options, + threshold=0.001) + + op_noparam = core.CreateOperator( + "MomentumSGD", + ["grad", "param_momentum", "lr"], + ["grad", "param_momentum"], + momentum=momentum, + nesterov=int(nesterov), + ) + + self.assertDeviceChecks( + dc, + op_noparam, + [grad, param_momentum, lr], + [0], + input_device_options=input_device_options, + threshold=0.001) + + +if __name__ == "__main__": + unittest.main() From 80e3081c28c1a235602c2e9858335851c31987ac Mon Sep 17 00:00:00 2001 From: wuhuikx Date: Thu, 27 Sep 2018 13:56:53 -0700 Subject: [PATCH 17/82] Add observers for mkldnn fallback operators (#9093) Summary: Add observers for ideep operators. 
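Note: observers are Caffe2's generic instrumentation hooks (the Observable interface, used e.g. for timing and profiling). The change below brackets the IDEEP RunOnDevice() call with StartAllObservers()/StopAllObservers(), matching what the other operator bases already do. A rough usage sketch (MyObserver stands in for any ObserverBase<OperatorBase> subclass; the attach call is illustrative):

  op->AttachObserver(caffe2::make_unique<MyObserver>(op.get()));
  op->Run();  // now notifies MyObserver before and after RunOnDevice()
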
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9093 Reviewed By: salexspb Differential Revision: D9952949 Pulled By: yinghai fbshipit-source-id: 1678d1a738f8781dc75eb3cb9dfb309f7b7934fb --- caffe2/ideep/utils/ideep_operator.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/caffe2/ideep/utils/ideep_operator.h b/caffe2/ideep/utils/ideep_operator.h index 89e0691c29960a..f9b6a831061388 100644 --- a/caffe2/ideep/utils/ideep_operator.h +++ b/caffe2/ideep/utils/ideep_operator.h @@ -51,7 +51,10 @@ class IDEEPOperator : public OperatorBase { // FinishDeviceComputation, // it is always just a re-route to RunOnDevice(). try { - return RunOnDevice(); + StartAllObservers(); + bool result = RunOnDevice(); + StopAllObservers(); + return result; } catch (EnforceNotMet& err) { err.AppendMessage(getErrorMsg()); throw; From c35f85a6d481db42e6406f64d5623ceafa698dbf Mon Sep 17 00:00:00 2001 From: "Gu, Jinghui" Date: Thu, 27 Sep 2018 14:38:02 -0700 Subject: [PATCH 18/82] Export symbols for pybind and other libs after caffe2 rebase (#11975) Summary: Export symbols for pybind and other libs after caffe2 rebase Pull Request resolved: https://github.com/pytorch/pytorch/pull/11975 Differential Revision: D10042615 Pulled By: yinghai fbshipit-source-id: 6de562d99403099113093716834abc51bf726e94 --- caffe2/ideep/operators/operator_fallback_ideep.h | 2 +- caffe2/ideep/utils/ideep_register.cc | 2 +- caffe2/mkl/utils/mkl_memory.h | 4 ++-- third_party/ideep | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 3226a08c4af9cf..0292dbd5d5a637 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -36,7 +36,7 @@ namespace caffe2 { * IDEEPFallbackOp>); */ template > -class IDEEPFallbackOp final : public IDEEPOperator { +class C10_EXPORT IDEEPFallbackOp final : public IDEEPOperator { public: USE_IDEEP_DEF_ALIASES(); USE_IDEEP_OPERATOR_FUNCTIONS(); diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 9aa3f195118039..9fe3108c032957 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -27,7 +27,7 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU); REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU); REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU); -BaseStaticContext* GetIDEEPStaticContext() { +C10_EXPORT BaseStaticContext* GetIDEEPStaticContext() { static IDEEPStaticContext context; return &context; } diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h index 736d8ede8cf53d..ffa1899197f2ba 100644 --- a/caffe2/mkl/utils/mkl_memory.h +++ b/caffe2/mkl/utils/mkl_memory.h @@ -148,7 +148,7 @@ class LayoutWrapper { * Most of the MKLMemory functions are not thread safe. */ template -class MKLMemory { +class C10_EXPORT MKLMemory { public: // Initializes an empty MKLMemory. 
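Note: C10_EXPORT controls symbol visibility. Simplified from the c10 macro headers, it expands roughly to

  #if defined(_WIN32)
  #define C10_EXPORT __declspec(dllexport)
  #else
  #define C10_EXPORT __attribute__((__visibility__("default")))
  #endif

so that classes and functions marked with it stay visible when the library is built with hidden default visibility; without the annotations below, pybind and other downstream libraries linking against the rebased caffe2 would fail with hidden or undefined symbols.
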
From c35f85a6d481db42e6406f64d5623ceafa698dbf Mon Sep 17 00:00:00 2001
From: "Gu, Jinghui"
Date: Thu, 27 Sep 2018 14:38:02 -0700
Subject: [PATCH 18/82] Export symbols for pybind and other libs after caffe2 rebase (#11975)

Summary:
Export symbols for pybind and other libs after caffe2 rebase
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11975

Differential Revision: D10042615

Pulled By: yinghai

fbshipit-source-id: 6de562d99403099113093716834abc51bf726e94
---
 caffe2/ideep/operators/operator_fallback_ideep.h | 2 +-
 caffe2/ideep/utils/ideep_register.cc             | 2 +-
 caffe2/mkl/utils/mkl_memory.h                    | 4 ++--
 third_party/ideep                                | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h
index 3226a08c4af9cf..0292dbd5d5a637 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@@ -36,7 +36,7 @@ namespace caffe2 {
  *            IDEEPFallbackOp<ReluOp<float, CPUContext>>);
  */
 template <class CPUOp, class SkipOutputCopy = SkipIndices<>>
-class IDEEPFallbackOp final : public IDEEPOperator {
+class C10_EXPORT IDEEPFallbackOp final : public IDEEPOperator {
  public:
   USE_IDEEP_DEF_ALIASES();
   USE_IDEEP_OPERATOR_FUNCTIONS();
diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc
index 9aa3f195118039..9fe3108c032957 100644
--- a/caffe2/ideep/utils/ideep_register.cc
+++ b/caffe2/ideep/utils/ideep_register.cc
@@ -27,7 +27,7 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
 REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
 REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);
 
-BaseStaticContext* GetIDEEPStaticContext() {
+C10_EXPORT BaseStaticContext* GetIDEEPStaticContext() {
   static IDEEPStaticContext context;
   return &context;
 }
diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h
index 736d8ede8cf53d..ffa1899197f2ba 100644
--- a/caffe2/mkl/utils/mkl_memory.h
+++ b/caffe2/mkl/utils/mkl_memory.h
@@ -148,7 +148,7 @@ class LayoutWrapper {
  * Most of the MKLMemory functions are not thread safe.
  */
 template <typename T>
-class MKLMemory {
+class C10_EXPORT MKLMemory {
  public:
   // Initializes an empty MKLMemory.
   MKLMemory() {}
@@ -460,7 +460,7 @@ class MKLMemory {
     return dims_;
   }
 
-  inline const int ndim() const { return dims_.size(); }
+  inline int ndim() const { return dims_.size(); }
 
   inline int dim32(const int i) const {
     CAFFE_ENFORCE_LT(dims_.at(i), std::numeric_limits<int>::max());
diff --git a/third_party/ideep b/third_party/ideep
index 4bd9a6800bf7db..dedff8fb8193fe 160000
--- a/third_party/ideep
+++ b/third_party/ideep
@@ -1 +1 @@
-Subproject commit 4bd9a6800bf7db068187619e0582d34dec9651dc
+Subproject commit dedff8fb8193fe3a1ea893d4bc852f8ea395b6b3

From 1619264ca52fad13f6a24ef1c03104c998c5d0b9 Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 17:27:56 -0700
Subject: [PATCH 19/82] Make ATen-core and caffe2 mutually recursive / merge template data() (#11970)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11970

Adds an ATen-core-headers target, which caffe2_cpu_internal depends on, and
makes ATen-core depend on caffe2_headers. If you link against ATen-core, you
must ALSO link against caffe2_cpu_internal; if you link against
caffe2_cpu_internal, you must ALSO link against ATen-core, otherwise you'll
have undefined symbols.

Then, we merge the template data() method with the Caffe2 implementation,
demonstrating that includes of Caffe2 (core) from ATen/core are working.

Reviewed By: jerryzh168

Differential Revision: D9967509

fbshipit-source-id: 3d220c38b2c3c646f8ff2884fdcc889fa9276c7a
---
 aten/src/ATen/core/TensorImpl.h | 14 ++++++++++++++
 setup.py                        |  1 +
 2 files changed, 15 insertions(+)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 27232e2a3a8e97..247b3828860db7 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -10,6 +10,8 @@
 #include "ATen/core/LegacyTypeDispatch.h"
 #include "ATen/core/Backend.h"
 
+#include "caffe2/core/logging.h"
+
 struct THTensor;
 
 namespace at {
@@ -100,6 +102,18 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
   template <typename T>
   inline T * data() const {
     AT_ASSERT(!is_variable());
+    CAFFE_ENFORCE_WITH_CALLER(
+        storage_.data() || numel_ == 0,
+        "The tensor is of non-zero shape, but its data is not allocated yet. "
+        "Caffe2 uses a lazy allocation, so you will need to call "
+        "mutable_data() or raw_mutable_data() to actually allocate memory.");
+    CAFFE_ENFORCE_WITH_CALLER(
+        storage_.IsType<T>(),
+        "Tensor type mismatch, caller expects elements to be ",
+        caffe2::TypeMeta::TypeName<T>(),
+        ", while tensor contains ",
+        data_type_.name(),
+        ". ");
     return storage_.data<T>() + storage_offset_;
   }
 
diff --git a/setup.py b/setup.py
index 2ea6871d593186..a8cdac91e92369 100644
--- a/setup.py
+++ b/setup.py
@@ -1203,6 +1203,7 @@ def make_relative_rpath(path):
                 'lib/include/c10/*.h',
                 'lib/include/c10/macros/*.h',
                 'lib/include/c10/util/*.h',
+                'lib/include/caffe2/core/*.h',
                 'lib/include/torch/*.h',
                 'lib/include/torch/csrc/*.h',
                 'lib/include/torch/csrc/api/include/torch/*.h',
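Note (illustrative, not part of the patch): the checks merged into data() above
turn the two common misuses into descriptive Caffe2-style errors: reading before
any storage is allocated, and reading with the wrong element type. A hedged usage
sketch against the 2018 caffe2::Tensor API follows; the variable names and the
exact constructor spelling are assumptions for illustration.

#include "caffe2/core/tensor.h"

void data_contract_sketch() {
  caffe2::Tensor t(caffe2::CPU);
  t.Resize(3, 4);                       // shape known, storage still lazy
  // t.data<float>();                   // would throw: nothing allocated yet
  float* p = t.mutable_data<float>();   // allocates and types the storage
  p[0] = 1.0f;
  const float* q = t.data<float>();     // fine: allocated, correct type
  (void)q;
  // t.data<int>();                     // would throw: float vs int mismatch
}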
From f6abd16a9dc40793fee71d6b287b26f488037310 Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 17:27:58 -0700
Subject: [PATCH 20/82] Merge TensorImpl. (#11971)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11971

- Switched TensorImpl::data() to use Storage::unsafe_data() to work around an
  outstanding bug in the Storage::data() implementation, where it only works
  on Ts which are valid ScalarTypes
- Qualify a bunch of identifiers which still live in the caffe2:: namespace
- strides() returns an IntList now
- s/update_strides/update_to_contiguous_strides/
- Correctly compute type_id_ for the Storage-only constructor from Caffe2.
  This is special-cased to only work for CPU and CUDA dense tensors.
- Fix some signed-unsigned comparisons in Caffe2 code (the OSS build for
  ATen/core has more restrictive warning tests.)

Reviewed By: jerryzh168

Differential Revision: D9995559

fbshipit-source-id: 9c74032e011189e1c7e9a98d20f2bd1e25ad2e5c
---
 aten/src/ATen/core/StorageImpl.h |   2 +
 aten/src/ATen/core/TensorImpl.h  | 904 +++++++++++++++++++++++++++++-
 caffe2/core/tensor.h             |   6 +-
 caffe2/core/tensor_impl.h        | 910 +------------------------------
 4 files changed, 910 insertions(+), 912 deletions(-)

diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h
index bba2df4e0d1bec..a92b14d147c5ae 100644
--- a/aten/src/ATen/core/StorageImpl.h
+++ b/aten/src/ATen/core/StorageImpl.h
@@ -74,6 +74,8 @@ struct CAFFE2_API StorageImpl : public c10::intrusive_ptr_target {
 
   template <typename T>
   inline T* data() const {
+    // TODO: This is bad: it means storage.data<T>() calls only work on
+    // T that are valid ScalarType. FIXME!
    auto data_type_T =
        at::scalarTypeToDataType(at::CTypeToScalarType<T>::to());
    if (dtype().id() != data_type_T) {
      AT_ERROR(
diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 247b3828860db7..826fa28fe6e229 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -9,26 +9,151 @@
 #include "ATen/core/TensorTypeIdRegistration.h"
 #include "ATen/core/LegacyTypeDispatch.h"
 #include "ATen/core/Backend.h"
+#include "ATen/core/context_base.h"
+#include "caffe2/core/allocator.h"
+#include "caffe2/core/common.h"
+#include "caffe2/core/flags.h"
 #include "caffe2/core/logging.h"
 
-struct THTensor;
+// A global boolean variable to control whether we free memory when a Tensor
+// is shrunk to a smaller size. As a result, a Tensor is always going to
+// keep the memory allocated for its maximum capacity reshaped to so far.
+//
+// This parameter is respected by "upper-case" methods which call Resize()
+// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_
+// or ShrinkTo, both of which guarantee never to free memory.
+CAFFE2_DECLARE_bool(caffe2_keep_on_shrink);
+
+// Since we can have high variance in blob memory allocated across different
+// inputs in the same run, we will shrink the blob only if the memory gain
+// is larger than this flag in bytes. This only applies to functions which
+// respect caffe2_keep_on_shrink.
+CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory);
+
+
+namespace caffe2 {
+
+// Defined by protobuf
+class DeviceOption;
+
+}
 
 namespace at {
 class Scalar;
 struct Type;
 struct Storage;
 class Tensor;
-} // namespace at
 
-namespace at {
+/**
+ * A utility function to convert std::vector<int> to std::vector<int64_t>.
+ */
+inline std::vector<int64_t> ToVectorint64_t(const std::vector<int>& src) {
+  return std::vector<int64_t>(src.begin(), src.end());
+}
+
+/**
+ * Return product of all dimensions starting from k
+ */
+inline int64_t size_from_dim_(int k, const std::vector<int64_t>& dims) {
+  int64_t r = 1;
+  for (size_t i = k; i < dims.size(); ++i) {
+    r *= dims[i];
+  }
+  return r;
+}
+
+// Product of all dims up to k (not including dims[k])
+inline int64_t size_to_dim_(int k, const std::vector<int64_t>& dims) {
+  CAFFE_ENFORCE((unsigned)k <= dims.size());
+  int64_t r = 1;
+  for (int i = 0; i < k; ++i) {
+    r *= dims[i];
+  }
+  return r;
+}
+
+// Product of all dims between k and l (not including dims[k] and dims[l])
+inline int64_t size_between_dim_(int k, int l, const std::vector<int64_t>& dims) {
+  CAFFE_ENFORCE((unsigned)l < dims.size());
+  int64_t r = 1;
+  if (k < l) {
+    for (int i = k + 1; i < l; ++i) {
+      r *= dims[i];
+    }
+  } else {
+    for (int i = l + 1; i < k; ++i) {
+      r *= dims[i];
+    }
+  }
+  return r;
+}
+
+// Wrap around axis_index if it is negative, s.t., -1 is the last dim
+inline int canonical_axis_index_(int axis_index, int ndims) {
+  CAFFE_ENFORCE_GE(axis_index, -ndims);
+  CAFFE_ENFORCE_LT(axis_index, ndims);
+  if (axis_index < 0) {
+    return axis_index + ndims;
+  }
+  return axis_index;
+}
+
+/**
+ * The low-level representation of a tensor, which contains a storage
+ * (which contains the actual data) and metadata (e.g., sizes and strides)
+ * describing this data as a tensor.
+ *
+ * Some basic characteristics about our in-memory representation of
+ * tensors:
+ *
+ *  - It contains a pointer to a storage struct (Storage/StorageImpl)
+ *    which contains the pointer to the actual data and records the
+ *    data type and device of the view. This allows multiple tensors
+ *    to alias the same underlying data, which allows us to efficiently
+ *    implement differing *views* on a tensor.
+ *
+ *  - The tensor struct itself records view-specific metadata about
+ *    the tensor, e.g., sizes, strides and offset into storage.
+ *    Each view of a storage can have a different size or offset.
+ *
+ *  - This class is intrusively refcounted. It is refcounted so that
+ *    we can support prompt deallocation of large tensors; it is
+ *    intrusively refcounted so that we can still perform reference
+ *    counted operations on raw pointers, which is often more convenient
+ *    when passing tensors across language boundaries.
+ */
 struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
   TensorImpl() = delete;
   TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable);
   TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable);
+  explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) {
+    data_type_ = storage_ ? storage_.dtype() : caffe2::TypeMeta{};
+  }
+
+  TensorImpl(const TensorImpl&) = default;
+  TensorImpl& operator=(const TensorImpl&) = default;
+  TensorImpl(TensorImpl&&) = default;
+  TensorImpl& operator=(TensorImpl&&) = default;
 
   virtual void release_resources() override;
 
+  // TODO: Ideally, type_id() would be the *only* key we need to consult
+  // to do a dispatch, instead of having to grovel through three different
+  // variables. Here's what's standing in the way:
+  //
+  //  - To eliminate ScalarType, we have to allocate a TensorTypeId for
+  //    each ScalarType+Backend combination, and then set it appropriately
+  //    when we initially allocate a TensorImpl.
+  //
+  //  - To eliminate is_variable, we have to allocate two classes of
+  //    TensorTypeId: ones that are variables, and ones that are not.
+  //    We may not want to eliminate this in the short term, because
+  //    hard-coding variable status into type_id() makes it more difficult
+  //    to do the "thread-local no_grad" trick (where we process Variables
+  //    "as if" they were non-Variables by setting a thread local variable.)
+  //
   Type & type() const {
     // NB: It's valid to use getTypeRaw here, because the TensorImpl
     // could not have been created without initializing the Type first.
@@ -44,9 +169,17 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
   virtual const Storage& storage() const;
   friend struct Type;
 
+  /**
+   * The number of elements in a tensor.
+   *
+   * WARNING: If you are using the Caffe2 API, this method can sometimes
+   * return -1, specifically when a tensor has not yet had its storage
+   * allocated by calling mutable_data(). You can use this case to
+   * test if a tensor is initialized or not.
+   */
   virtual int64_t numel() const {
 #ifdef DEBUG
-    AT_ASSERT(compute_numel() == numel_);
+    AT_ASSERT(numel_ == -1 || compute_numel() == numel_);
 #endif
     return numel_;
   }
@@ -104,7 +237,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     AT_ASSERT(!is_variable());
     CAFFE_ENFORCE_WITH_CALLER(
         storage_.data() || numel_ == 0,
-        "The tensor is of non-zero shape, but its data is not allocated yet. "
+        "The tensor has a non-zero number of elements, but its data is not allocated yet. "
         "Caffe2 uses a lazy allocation, so you will need to call "
         "mutable_data() or raw_mutable_data() to actually allocate memory.");
     CAFFE_ENFORCE_WITH_CALLER(
@@ -114,7 +247,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
         ", while tensor contains ",
         data_type_.name(),
         ". ");
-    return storage_.data<T>() + storage_offset_;
+    // We managed the type check ourselves
+    return storage_.unsafe_data<T>() + storage_offset_;
   }
 
   inline void* data() const {
@@ -228,5 +362,763 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
  private:
   TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable);
+
+ public:
+
+  /**
+   * The static context of a tensor intuitively represents the device
+   * type of a tensor; e.g., a CPU tensor is associated with the
+   * GetCPUStaticContext(). This method replaces the former Context template
+   * parameter which was previously used to identify the device type
+   * of a tensor.
+   */
+  at::BaseStaticContext* GetStaticContext() const {
+    auto device_type = GetDeviceType();
+    return ::caffe2::get_static_context(device_type);
+  }
+
+  /* @brief
+   * Create a context that has the same device_type
+   * as the tensor.
+   * Note that this doesn't support passing in arguments.
+   * TODO(jerryzh): move this to a global registry
+   * that can create context for us, and then eliminate
+   * this method.
+   */
+  std::unique_ptr<at::BaseContext> CreateContext() const {
+    return GetStaticContext()->CreateContext();
+  }
+
+  at::DeviceType GetDeviceType() const {
+    return storage_.device_type();
+  }
+
+  /**
+   * @brief Copies the data from a source tensor, with a context provided to
+   * carry out the underlying memcpy operation. This method respects
+   * caffe2_keep_on_shrink.
+ */ + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (data_type_ != src.meta()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); + storage_ = at::Storage(GetDeviceType(), src.meta()); + data_type_ = src.meta(); + } + if (src.size() == -1) { + sizes_.clear(); + numel_ = -1; + strides_.clear(); + is_contiguous_ = true; + storage_.reset(); + data_type_ = caffe2::TypeMeta(); + return; + } + Resize(src.dims()); + if (size() > 0) { + if (data_type_.copy()) { + CAFFE_ENFORCE( + GetDeviceType() == ::at::DeviceType::CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.GetDeviceType() == ::at::DeviceType::CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { + if (!context) { + src.CreateContext()->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } else { + CAFFE_ENFORCE( + context->device_type() == src.GetDeviceType(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext()->CopyBytesFromCPU( + nbytes(), src.raw_data(), raw_mutable_data()); + } + } + } + } + + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. + */ + void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(sizes_.size(), 1u); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - sizes_[0], growthPct, context); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. 
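// Aside (illustrative, not from this patch): the amortized-O(1) claim in the
// comment above relies on geometric growth. When capacity runs out, the outer
// dimension grows by growthPct percent of the current size rather than by the
// exact `num` requested, so n one-row Extend() calls trigger only O(log n)
// reallocations. A standalone sketch of that capacity rule, with a
// hypothetical helper name:

#include <algorithm>
#include <cmath>
#include <cstdint>

int64_t grown_outer_dim(int64_t dim0, int64_t needed, float growthPct) {
  // e.g. dim0 = 100, growthPct = 40 reserves at least 140 rows.
  return std::max<int64_t>(
      needed,
      static_cast<int64_t>(std::ceil(dim0 * (growthPct + 100) / 100)));
}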
+ */ + void Extend(int64_t num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(sizes_.size(), 1u); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + auto newDims = sizes_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + sizes_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = sizes_; + newCapacity[0] = std::max( + newDims[0], std::ceil(sizes_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = sizes_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + data_type_, oldSize, oldData.get(), newData); + reserved_ = true; + sizes_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shrunk tensor is maintained. + * Notably, this function does NOT respect caffe2_keep_on_shrink. + */ + void ShrinkTo(int64_t outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShrinkTo is only supported on contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER(sizes_.size() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= sizes_[0], + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + storage_.unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + sizes_[0] = outer_dim; + numel_ = std::accumulate( + sizes_.begin(), + sizes_.end(), + static_cast(1), + std::multiplies()); + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = sizes_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = sizes_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + sizes_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). 
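// Aside (illustrative, not from this patch): ShrinkTo() and ReserveSpace()
// above are the capacity-preserving companions to Extend(). A hedged sketch of
// the reserve-then-extend pattern against the 2018 Caffe2 Tensor API; the
// helper name and the exact call sequence are assumptions, not code from this
// series:

#include "caffe2/core/tensor.h"

void reserve_then_extend(caffe2::Tensor& buf, at::BaseContext* ctx) {
  buf.Resize(0, 16);             // fix the row width; no storage yet
  buf.mutable_data<float>();     // type the storage before reserving
  buf.ReserveSpace(1024);        // reserve 1024 rows; logical size stays 0
  buf.ExtendTo(512, 50.f, ctx);  // grows inside the reservation: no realloc
  buf.ShrinkTo(100);             // logical size drops; capacity is retained
}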
+ * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + * + * This method respects caffe2_keep_on_shrink. Consult the internal logic + * of this method to see exactly under what circumstances this flag matters. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. + bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || + !caffe2::FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > + static_cast(caffe2::FLAGS_caffe2_max_keep_on_shrink_memory); + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + * This method respects caffe2_keep_on_shrink. + */ + inline void ResizeLike(const TensorImpl& src_tensor) { + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); + // Note: need casting for different context types. + if (static_cast(this) != static_cast(&src_tensor)) { + Resize(src_tensor.dims()); + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + int64_t new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + sizes_ = dims; + } + + inline void Reshape(const std::vector& dims) { + Reshape(ToVectorint64_t(dims)); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = at::Storage(storage_.device_type(), data_type_); + storage_offset_ = 0; + } + + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. 
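// Aside (illustrative, not from this patch): the Resize()/Reshape() contrast
// documented above is easy to trip over. Reshape() only relabels dimensions
// and insists that the item count stays fixed, while Resize() may drop the
// allocation. A hedged sketch (2018 Caffe2 API assumed):

#include <vector>
#include "caffe2/core/tensor.h"

void resize_vs_reshape(caffe2::Tensor& t) {
  t.Resize(6, 4);
  t.mutable_data<float>();                // allocate 24 floats
  t.Reshape(std::vector<int64_t>{4, 6});  // OK: still 24 items, storage kept
  t.Resize(5, 5);                         // 25 != 24: memory freed, realloc is lazy
}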
+ */ + std::string DebugString() const { + std::stringstream ss; + ss << "A Tensor of item size " << storage_.itemsize() << " and type " + << data_type_.name() << " and dimension ("; + for (int d : sizes_) { + ss << d << ","; + } + ss << ")."; + return ss.str(); + } + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. + /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + storage_offset_ = src.storage_offset(); + } + + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. 
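// Aside (illustrative, not from this patch): ShareData() above is what makes
// "same item count, different shape" aliases work; a 2-D tensor and its
// flattened view share one storage, so writes through either are visible
// through both. A hedged sketch (2018 Caffe2 API assumed, names illustrative):

#include "caffe2/core/tensor.h"

void flattened_alias(caffe2::Tensor& matrix) {
  matrix.Resize(3, 4);
  matrix.mutable_data<float>();

  caffe2::Tensor flat(caffe2::CPU);
  flat.Resize(12);                       // same numel, different shape
  flat.ShareData(matrix);                // alias matrix's storage
  flat.mutable_data<float>()[0] = 42.f;  // also updates matrix at (0, 0)
}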
+ */ + template + void + ShareExternalPointer(T* src, size_t capacity = 0, caffe2::MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, caffe2::TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), caffe2::TypeMeta::Make(), capacity); + } + + void ShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity = 0, + caffe2::MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShareExternalPointer is only supported for contiguos Tensor."); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + data_type_ = data_type; + storage_offset_ = 0; + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; + storage_offset_ = 0; + } + } + + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. + */ + inline const void* raw_data() const { + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); + return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const caffe2::TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. 
You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = data_type_.dtor() != nullptr; + storage_offset_ = 0; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (data_type_ != meta) { + storage_ = at::Storage(storage_.device_type(), meta); + } + } + data_type_ = meta; + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = data_type_.dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + data_type_.ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. + auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data() { + CAFFE_ENFORCE_WITH_CALLER( + data_type_.id() != caffe2::TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(data_type_); + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. + */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && IsType()) { + return static_cast(storage_.data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); + } + + /** + * Returns the number of dimensions of the data. + */ + inline int ndim() const { + return sizes_.size(); + } + /** + * Returns the size (i.e. 
the number of items) of the tensor. + */ + inline int64_t size() const { + return numel_; + } + /** + * Return the number of bytes each item takes in the tensor. + */ + inline size_t itemsize() const { + return storage_.itemsize(); + } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). + */ + inline size_t nbytes() const { + return numel_ * itemsize(); + ; + } + + // NB: This capacity may also include available space + // in the storage BEFORE the tensor data, if storage_offset != 0 + inline size_t capacity_nbytes() const { + return storage_.capacity(); + } + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const std::vector& dims() const { + return sizes_; + } + + inline int64_t size_from_dim(int k) const { + return size_from_dim_(k, sizes_); + } + + inline int64_t size_to_dim(int k) const { + return size_to_dim_(k, sizes_); + } + + inline int64_t size_between_dim(int k, int l) const { + return size_between_dim_(k, l, sizes_); + } + + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < ndim(), return index. + * If -ndim <= index <= -1, return (ndim() - (-index)), + * e.g., the last axis index (ndim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int canonical_axis_index(int axis_index) const { + return canonical_axis_index_(axis_index, ndim()); + } + + /** + * Checks if the tensor content is of the given data type. + */ + template + inline bool IsType() const { + return storage_.IsType(); + } + /** + * Returns the TypeMeta object associated with the current data type. + */ + inline const caffe2::TypeMeta& meta() const { + return data_type_; + } + + /** + * Returns the i-th dimension of the tensor in int. + * + * This function returns an int value instead of int64_t, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. + */ + inline int dim32(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(sizes_.size()), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + CAFFE_ENFORCE_LT_WITH_CALLER(sizes_[i], std::numeric_limits::max()); + return static_cast(sizes_[i]); + } + + /** + * Returns the i-th dimension of the tensor. Note that the passed in index + * must be between 0 (inclusive) and the number of dimensions, otherwise + * this function will produce a fatal message. + */ + inline int64_t dim(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(sizes_.size()), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + return sizes_[i]; + } + + void ExtractDeviceOption(caffe2::DeviceOption* device) const { + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, raw_data()); + } + + protected: + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. 
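// Aside (illustrative, not from this patch): worked values for the axis
// helpers declared near the top of this header, using dims {2, 3, 4}:

#include <cassert>
#include <cstdint>
#include <vector>

void axis_helper_examples() {
  std::vector<int64_t> dims = {2, 3, 4};
  assert(at::size_from_dim_(1, dims) == 12);       // 3 * 4
  assert(at::size_to_dim_(2, dims) == 6);          // 2 * 3
  assert(at::size_between_dim_(0, 2, dims) == 3);  // only dims[1]
  assert(at::canonical_axis_index_(-1, 3) == 2);   // -1 names the last axis
  assert(at::canonical_axis_index_(1, 3) == 1);
}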
+ bool reserved_ = false; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const std::vector& src) { + auto old_numel = numel_; + sizes_.resize(src.size()); + int64_t new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + sizes_[i] = src[i]; + } + update_to_contiguous_strides(); + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + sizes_.resize(0); + update_to_contiguous_strides(); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. + bool SetDims(const int64_t d0) { + auto old_numel = numel_; + sizes_.resize(1); + sizes_[0] = d0; + update_to_contiguous_strides(); + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const int64_t d0, const int64_t d1) { + auto old_numel = numel_; + sizes_.resize(2); + sizes_[0] = d0; + sizes_[1] = d1; + update_to_contiguous_strides(); + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { + auto old_numel = numel_; + sizes_.resize(3); + sizes_[0] = d0; + sizes_[1] = d1; + sizes_[2] = d2; + update_to_contiguous_strides(); + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) { + auto old_numel = numel_; + sizes_.resize(4); + sizes_[0] = d0; + sizes_[1] = d1; + sizes_[2] = d2; + sizes_[3] = d3; + update_to_contiguous_strides(); + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } + + inline void update_to_contiguous_strides() { + strides_.resize(sizes_.size()); + if (ndim() > 0) { + int last_idx = ndim() - 1; + strides_[last_idx] = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + strides_[i] = strides_[i + 1] * std::max(sizes_[i + 1], 1); + } + } + is_contiguous_ = true; + } + }; } // namespace at diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 1e4cac2788b560..b5e47ecfd042fc 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -9,7 +9,7 @@ namespace caffe2 { class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { - UndefinedTensorImpl() : TensorImpl(at::Storage()){}; + UndefinedTensorImpl() : TensorImpl(CPU){}; public: // Without this, we get: @@ -33,6 +33,8 @@ class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { * redirects API calls to TensorImpl; * Copying of Tensor results in sharing the same underlying implementation * object + * + * NB: See TensorImpl for documentation on these methods. */ class CAFFE2_API Tensor final { protected: @@ -286,7 +288,7 @@ class CAFFE2_API Tensor final { return impl_.get()->stride(dim); } - inline at::DimVector strides() { + inline at::IntList strides() { return impl_.get()->strides(); } diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 53c812f55e297b..17d3b22083bbb1 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -5,909 +5,11 @@ #include #include -#include "caffe2/core/allocator.h" -#include "caffe2/core/common.h" -#include "caffe2/core/flags.h" -#include "caffe2/core/logging.h" - -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. 
-CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); - namespace caffe2 { - -// Defined by protobuf -class DeviceOption; - -/** - * A utility function to convert vector to vector. - */ -inline std::vector ToVectorint64_t(const std::vector& src) { - return std::vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline int64_t size_from_dim_(int k, const std::vector& dims) { - int64_t r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline int64_t size_to_dim_(int k, const std::vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - int64_t r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline int64_t size_between_dim_(int k, int l, const std::vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - int64_t r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. - * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { - data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; - } - - TensorImpl(const TensorImpl&) = default; - TensorImpl& operator=(const TensorImpl&) = default; - TensorImpl(TensorImpl&&) = default; - TensorImpl& operator=(TensorImpl&&) = default; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - at::BaseStaticContext* GetStaticContext() const { - auto device_type = GetDeviceType(); - return get_static_context(device_type); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. 
- */ - void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (data_type_ != src.meta()) { - CAFFE_ENFORCE_WITH_CALLER( - src.is_contiguous(), - "Right now only copy of contiguous source Tensor is supported."); - storage_ = at::Storage(GetDeviceType(), src.meta()); - data_type_ = src.meta(); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - strides_.clear(); - is_contiguous_ = true; - storage_.reset(); - data_type_ = TypeMeta(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (data_type_.copy()) { - CAFFE_ENFORCE( - GetDeviceType() == ::at::DeviceType::CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == ::at::DeviceType::CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->device_type() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of `num`. - */ - void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. 
- */ - void Extend(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now Extend is only supported for contiguous Tensor."); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(data_type_); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - data_type_, oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(int64_t outer_dim) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ShrinkTo is only supported on contiguous Tensor."); - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. - * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ReserveSpace is only supported for contiguous Tensor."); - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(data_type_); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. - * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). 
- * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - CAFFE_ENFORCE_WITH_CALLER( - src_tensor.is_contiguous(), - "Right now ResizeLike is only supported for contiguous Tensor."); - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const std::vector& dims) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now Reshape is only supported for contiguous Tensor."); - int64_t new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const std::vector& dims) { - Reshape(ToVectorint64_t(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = at::Storage(storage_.device_type(), data_type_); - storage_offset_ = 0; - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - std::string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << data_type_.name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - /** - * @brief Shares the data with another tensor. 
- * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - data_type_ = src.dtype(); - storage_offset_ = src.storage_offset(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. 
- */ - template - void - ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { - ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { - ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); - } - - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ShareExternalPointer is only supported for contiguos Tensor."); - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - ShareExternalPointer( - at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); - } - - void ShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - data_type_ = data_type; - storage_offset_ = 0; - } else { - int64_t numel = capacity / data_type.itemsize(); - // Create a new Storage - storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); - data_type_ = data_type; - storage_offset_ = 0; - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. - */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. " - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - ", while tensor contains ", - data_type_.name(), - ". "); - return static_cast(storage_.data()) + storage_offset_; - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. 
- */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (data_type_ == meta && (storage_.data() || numel_ == 0)) { - return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); - } else { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - bool had_special_dtor = data_type_.dtor() != nullptr; - storage_offset_ = 0; - if (storage_.unique()) { - storage_.set_dtype(meta); - } else { - if (data_type_ != meta) { - storage_ = at::Storage(storage_.device_type(), meta); - } - } - data_type_ = meta; - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.numel() >= numel_)) { - AT_ASSERT(storage_offset_ == 0); // because we just reallocated - return storage_.data(); - } - const at::Allocator* allocator = storage_.allocator(); - // TODO: Get rid of StaticContext - CAFFE_ENFORCE( - allocator == nullptr, - "Allocator is not used within Caffe2 functions, please use StaticContext instead."); - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = data_type_.dtor(); - void* ptr; - at::DeleterFnPtr deleter; - auto ptr_and_deleter = GetStaticContext()->New( - numel_ * storage_.itemsize()); // Removing this can get rid of - // InefficientStdFunctionContext - ptr = ptr_and_deleter.first; - deleter = ptr_and_deleter.second; - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr, - [size, dtor, deleter](void* local_ptr) -> void { - dtor(local_ptr, size); - deleter(local_ptr); - }, - at::Device(storage_.device_type()))); - data_type_.ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr_and_deleter.first, - ptr_and_deleter.second, - at::Device(storage_.device_type()))); - } - storage_.set_numel(numel_); - AT_ASSERT(storage_offset_ == 0); // because we just reallocated - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(data_type_); - } - - /** - * Returns a typed pointer of the underlying storage. - * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. 
-   */
-  template <typename T>
-  inline T* mutable_data() {
-    if ((numel_ == 0 || storage_.data()) && IsType<T>()) {
-      return static_cast<T*>(storage_.data()) + storage_offset_;
-    }
-    // Check it here statically - otherwise TypeMeta would throw the runtime
-    // error in attempt to invoke TypeMeta::ctor()
-    static_assert(
-        std::is_default_constructible<T>::value,
-        "Tensor can't hold non-default-constructible types");
-    return static_cast<T*>(raw_mutable_data(TypeMeta::Make<T>()));
-  }
-
-  /**
-   * Returns the number of dimensions of the data.
-   */
-  inline int ndim() const {
-    return dims_.size();
-  }
-  /**
-   * Returns the size (i.e. the number of items) of the tensor.
-   */
-  inline int64_t size() const {
-    return numel_;
-  }
-  /**
-   * Return the number of bytes each item takes in the tensor.
-   */
-  inline size_t itemsize() const {
-    return storage_.itemsize();
-  }
-  /**
-   * Returns the total number of bytes of the storage.
-   *
-   * This is equivalent to calling size() * itemsize().
-   */
-  inline size_t nbytes() const {
-    return numel_ * itemsize();
-    ;
-  }
-
-  /**
-   * Returns the dimensions of the tensor as a vector.
-   */
-  inline const std::vector<int64_t>& dims() const {
-    return dims_;
-  }
-
-  inline int64_t size_from_dim(int k) const {
-    return size_from_dim_(k, dims_);
-  }
-
-  inline int64_t size_to_dim(int k) const {
-    return size_to_dim_(k, dims_);
-  }
-
-  inline int64_t size_between_dim(int k, int l) const {
-    return size_between_dim_(k, l, dims_);
-  }
-
-  /**
-   * Returns the 'canonical' version of a (usually) user-specified axis,
-   * allowing for negative indexing (e.g., -1 for the last axis).
-   *
-   * @param axis_index the axis index.
-   *        If 0 <= index < ndim(), return index.
-   *        If -ndim <= index <= -1, return (ndim() - (-index)),
-   *        e.g., the last axis index (ndim() - 1) if index == -1,
-   *        the second to last if index == -2, etc.
-   *        Dies on out of range index.
-   */
-  inline int canonical_axis_index(int axis_index) const {
-    return canonical_axis_index_(axis_index, ndim());
-  }
-
-  inline int64_t stride(int64_t dim) const {
-#ifndef NDEBUG
-    // TODO: dim wrapping?
-    CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit");
-    CAFFE_ENFORCE_GE_WITH_CALLER(
-        dim, 0, "Cannot have negative dimension index");
-#endif
-    return strides_[dim];
-  }
-
-  // TODO: Change to ArrayRef later
-  inline at::DimVector strides() {
-    return strides_;
-  }
-
-  inline bool is_contiguous() const {
-    return is_contiguous_;
-  }
-
-  /**
-   * Checks if the tensor content is of the given data type.
-   */
-  template <typename T>
-  inline bool IsType() const {
-    return storage_.IsType<T>();
-  }
-  /**
-   * Returns the TypeMeta object associated with the current data type.
-   */
-  inline const TypeMeta& meta() const {
-    return data_type_;
-  }
-
-  inline const TypeMeta& dtype() const {
-    return data_type_;
-  }
-
-  /**
-   * Returns the i-th dimension of the tensor in int.
-   *
-   * This function returns an int value instead of int64_t, which depending on
-   * the typedef could be int64. If you want int64 dim values, make sure you
-   * call dim() instead.
-   */
-  inline int dim32(const int i) const {
-#ifndef NDEBUG
-    CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit");
-    CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index");
-#endif
-    CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits<int>::max());
-    return static_cast<int>(dims_[i]);
-  }
-
-  /**
-   * Returns the i-th dimension of the tensor. Note that the passed in index
-   * must be between 0 (inclusive) and the number of dimensions, otherwise
-   * this function will produce a fatal message.
-   */
-  inline int64_t dim(const int i) const {
-#ifndef NDEBUG
-    CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit");
-    CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index");
-#endif
-    return dims_[i];
-  }
-
-  void ExtractDeviceOption(DeviceOption* device) const {
-    auto* context = GetStaticContext();
-    CHECK(context);
-    context->ExtractDeviceOption(device, raw_data());
-  }
-
-  const at::Storage& storage() {
-    return storage_;
-  }
-
-  const at::Storage& storage() const {
-    return storage_;
-  }
-
-  int64_t storage_offset() const {
-    return storage_offset_;
-  }
-
- protected:
-  // TODO: change to DimVector
-  std::vector<int64_t> dims_; // sizes_
-  at::DimVector strides_;
-  int64_t numel_ = -1; // numel_
-  bool is_contiguous_ = true;
-  // we decide to keep reserved_ and it will
-  // live in Tensor after the split
-  // The logic is that if Extend() or ReserveSpace() were ever called,
-  // then subsequent Resize()s will not free up Storage.
-  bool reserved_ = false;
-  at::Storage storage_;
-  int64_t storage_offset_ = 0;
-  TypeMeta data_type_;
-
- private:
-  template <
-      typename T,
-      typename = typename std::enable_if<std::is_integral<T>::value>::type>
-  bool SetDims(const std::vector<T>& src) {
-    auto old_numel = numel_;
-    dims_.resize(src.size());
-    int64_t new_numel = 1;
-    for (size_t i = 0; i < src.size(); ++i) {
-      new_numel *= src[i];
-      dims_[i] = src[i];
-    }
-    update_strides();
-    numel_ = new_numel;
-    return numel_ != old_numel;
-  }
-
-  bool SetDims() {
-    auto old_numel = numel_;
-    dims_.resize(0);
-    update_strides();
-    numel_ = 1;
-    return numel_ != old_numel;
-  }
-
-  // TODO(jiayq): maybe rewrite the following functions with initializer list.
-  // NVCC does not play well with initializer lists last time, but worth
-  // another shot.
-  bool SetDims(const int64_t d0) {
-    auto old_numel = numel_;
-    dims_.resize(1);
-    dims_[0] = d0;
-    update_strides();
-    numel_ = d0;
-    return numel_ != old_numel;
-  }
-
-  bool SetDims(const int64_t d0, const int64_t d1) {
-    auto old_numel = numel_;
-    dims_.resize(2);
-    dims_[0] = d0;
-    dims_[1] = d1;
-    update_strides();
-    numel_ = d0 * d1;
-    return numel_ != old_numel;
-  }
-
-  bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) {
-    auto old_numel = numel_;
-    dims_.resize(3);
-    dims_[0] = d0;
-    dims_[1] = d1;
-    dims_[2] = d2;
-    update_strides();
-    numel_ = d0 * d1 * d2;
-    return numel_ != old_numel;
-  }
-
-  bool
-  SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) {
-    auto old_numel = numel_;
-    dims_.resize(4);
-    dims_[0] = d0;
-    dims_[1] = d1;
-    dims_[2] = d2;
-    dims_[3] = d3;
-    update_strides();
-    numel_ = d0 * d1 * d2 * d3;
-    return numel_ != old_numel;
-  }
-
-  inline void update_strides() {
-    strides_.resize(dims_.size());
-    if (ndim() > 0) {
-      int last_idx = ndim() - 1;
-      strides_[last_idx] = 1;
-      for (auto i = last_idx - 1; i >= 0; --i) {
-        strides_[i] = strides_[i + 1] * std::max<int64_t>(dims_[i + 1], 1);
-      }
-    }
-    is_contiguous_ = true;
-  }
-};
-
+using at::ToVectorint64_t;
+using at::size_from_dim_;
+using at::size_to_dim_;
+using at::size_between_dim_;
+using at::canonical_axis_index_;
+using at::TensorImpl;
 }

From e8cb6cb9d21dbf816be72a11febc843ca2d0d2e1 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Thu, 27 Sep 2018 17:31:14 -0700
Subject: [PATCH 21/82] Fix some symbolics for ReduceSum, GE, LE (#12123)

Summary:
ReduceSum: turn negative indices into positive ones, since Caffe2 does not
support negative indices. GE/LE: the symbolics emitted their operands in the
wrong order.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/12123

Reviewed By: houseroad

Differential Revision: D10095467

Pulled By: wanchaol

fbshipit-source-id: eb20248de5531c25040ee68b89bd18743498138d
---
 test/onnx/expect/TestOperators.test_ge.expect | 2 +-
 test/onnx/expect/TestOperators.test_le.expect | 2 +-
 torch/onnx/symbolic.py                        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect
index 63f1f3cc563951..f11abcc7e6ef3b 100644
--- a/test/onnx/expect/TestOperators.test_ge.expect
+++ b/test/onnx/expect/TestOperators.test_ge.expect
@@ -3,8 +3,8 @@ producer_name: "pytorch"
 producer_version: "0.4"
 graph {
   node {
-    input: "1"
     input: "0"
+    input: "1"
     output: "2"
     op_type: "Less"
   }
diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect
index fb36f3449f2664..b50002eacbaf3b 100644
--- a/test/onnx/expect/TestOperators.test_le.expect
+++ b/test/onnx/expect/TestOperators.test_le.expect
@@ -3,8 +3,8 @@ producer_name: "pytorch"
 producer_version: "0.4"
 graph {
   node {
-    input: "1"
     input: "0"
+    input: "1"
     output: "2"
     op_type: "Greater"
  }
diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py
index 91de860e2c68ee..eae2e7fe2cdab3 100644
--- a/torch/onnx/symbolic.py
+++ b/torch/onnx/symbolic.py
@@ -633,12 +633,12 @@ def lt(g, input, other):
 
 def ge(g, input, other):
     other = _maybe_get_scalar(other)
-    return g.op("Not", lt(g, _if_scalar_type_as(g, other, input), input))
+    return g.op("Not", lt(g, input, _if_scalar_type_as(g, other, input)))
 
 
 def le(g, input, other):
     other = _maybe_get_scalar(other)
-    return g.op("Not", gt(g, _if_scalar_type_as(g, other, input), input))
+    return g.op("Not", gt(g, input, _if_scalar_type_as(g, other, input)))
 
 
 @parse_args('v', 'i')
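To see why the operand order in the patch above matters: `ge` is lowered to
Not(Less(...)), and negating a strict comparison flips its direction. A
self-contained C++ sanity check of the identity (illustrative only, not
PyTorch code):

    #include <cassert>

    int main() {
      int input = 3, other = 5;
      // fixed lowering: Not(Less(input, other)) is exactly (input >= other)
      assert((!(input < other)) == (input >= other));
      // old lowering: Not(Less(other, input)) is (input <= other), i.e. it
      // computed LE where GE was intended (and symmetrically for le/gt)
      assert((!(other < input)) == (input <= other));
      return 0;
    }
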
From c5fc2f1105de9966f1d1628387ace2ca8866471b Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:00 -0700
Subject: [PATCH 22/82] Merge UndefinedTensorImpl.

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11972

Reviewed By: gchanan, Yangqing, jerryzh168

Differential Revision: D9995633

fbshipit-source-id: 6b4645c9d4bb0bc4301cd4bcfa76cf85331b8379
---
 caffe2/core/tensor.cc |  2 --
 caffe2/core/tensor.h  | 21 ++-------------------
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc
index caa0ba9ea55f49..0e531c83fcb7ad 100644
--- a/caffe2/core/tensor.cc
+++ b/caffe2/core/tensor.cc
@@ -6,8 +6,6 @@ namespace caffe2 {
 
 CAFFE_DEFINE_KNOWN_TYPE(Tensor);
 
-UndefinedTensorImpl UndefinedTensorImpl::singleton_;
-
 TensorPrinter::TensorPrinter(
     const std::string& tensor_name,
     const std::string& file_name,
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index b5e47ecfd042fc..8f49e41706b882 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -5,28 +5,11 @@
 #include "caffe2/core/tensor_impl.h"
 
 #include <ATen/core/intrusive_ptr.h>
+#include <ATen/core/UndefinedTensorImpl.h>
 
 namespace caffe2 {
 
-class CAFFE2_API UndefinedTensorImpl final : public TensorImpl {
-  UndefinedTensorImpl() : TensorImpl(CPU){};
-
- public:
-  // Without this, we get:
-  //   error: identifier "at::UndefinedTensor::_singleton" is undefined in device code
-  // (ostensibly because the constexpr tricks MSVC into trying to compile this
-  // function for device as well).
-#ifdef _WIN32
-  static inline TensorImpl * singleton() {
-#else
-  static constexpr inline TensorImpl * singleton() {
-#endif
-    return &singleton_;
-  }
-
- private:
-  static UndefinedTensorImpl singleton_;
-};
+using at::UndefinedTensorImpl;
 
 /**
  * @brief Tensor class holds a shared pointer to the implementation TensorImpl,

From 6a2dbc98084c7913d18dc8b34423f0b932e5ba2b Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:02 -0700
Subject: [PATCH 23/82] Rename TensorImpl::GetDeviceType to device_type, and
 properly test if is_variable

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12087

Reviewed By: jerryzh168

Differential Revision: D10050781

fbshipit-source-id: 0b6c9d7caf3b1000691f86fcc7f2ef203936a29f
---
 aten/src/ATen/core/TensorImpl.h | 28 ++++++++++++++--------------
 caffe2/core/tensor.h            |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 826fa28fe6e229..1e7de709cc9571 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -365,6 +365,11 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 
  public:
+  at::DeviceType device_type() const {
+    AT_ASSERT(!is_variable());
+    return storage_.device_type();
+  }
+
   /**
    * The static context of a tensor intuitively represents the device
    * type of a tensor; e.g., a CPU tensor is associated with the
   * of a tensor.
*/ at::BaseStaticContext* GetStaticContext() const { - auto device_type = GetDeviceType(); - return ::caffe2::get_static_context(device_type); + return ::caffe2::get_static_context(device_type()); } /* @brief @@ -389,10 +393,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return GetStaticContext()->CreateContext(); } - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - /** * @brief Copies the data from a source tensor, with a contex provided to * carry out the underlying memcpy operation. This method respects @@ -406,7 +406,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_WITH_CALLER( src.is_contiguous(), "Right now only copy of contiguous source Tensor is supported."); - storage_ = at::Storage(GetDeviceType(), src.meta()); + storage_ = at::Storage(device_type(), src.meta()); data_type_ = src.meta(); } if (src.size() == -1) { @@ -422,26 +422,26 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (size() > 0) { if (data_type_.copy()) { CAFFE_ENFORCE( - GetDeviceType() == ::at::DeviceType::CPU, + device_type() == ::at::DeviceType::CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); CAFFE_ENFORCE( - src.GetDeviceType() == ::at::DeviceType::CPU, + src.device_type() == ::at::DeviceType::CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); } else { // We'll need to use a non-CPU context to perform the copy if // one of the context is not CPU since only non-CPU context // knows how to copy between CPU and that context - if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { + if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) { if (!context) { src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + nbytes(), src.raw_data(), raw_mutable_data(), device_type()); } else { CAFFE_ENFORCE( - context->device_type() == src.GetDeviceType(), + context->device_type() == src.device_type(), "Type for provided context does not match the type of source"); context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + nbytes(), src.raw_data(), raw_mutable_data(), device_type()); } } else { // In case source context is CPU, and target context is non-CPU @@ -757,7 +757,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "To share with a raw external pointer you need to pass in an " "initialized data_type(TypeMeta)."); ShareExternalPointer( - at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + at::DataPtr(src, src, d, device_type()), data_type, capacity); } void ShareExternalPointer( diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 8f49e41706b882..2354ceb3ab6c47 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -120,7 +120,7 @@ class CAFFE2_API Tensor final { } DeviceType GetDeviceType() const { - return impl_.get()->GetDeviceType(); + return impl_->device_type(); } void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const { From 00c6fb16e70f001f2c2560f95dcc0dbb3d460057 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 27 Sep 2018 18:29:04 -0700 Subject: [PATCH 24/82] Move ExtendTo to caffe2::Tensor from TensorImpl Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12089 Reviewed By: jerryzh168 Differential Revision: D10050859 
fbshipit-source-id: 843067aacfa2a519657220bc39a0f499582a48a4 --- aten/src/ATen/core/TensorImpl.h | 11 ----------- caffe2/core/tensor.h | 9 ++++++++- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 1e7de709cc9571..0f44afee22ab6d 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -454,17 +454,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } } - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of `num`. - */ - void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(sizes_.size(), 1u); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - sizes_[0], growthPct, context); - } - /** * @brief Extends the outer-most dimension of this tensor by num elements, * preserving the existing data. diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 2354ceb3ab6c47..331e26713d6add 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -127,8 +127,15 @@ class CAFFE2_API Tensor final { impl_.get()->CopyFrom(*src.impl_.get(), context); } + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. + */ void ExtendTo(int64_t num, float growthPct, BaseContext* context) const { - impl_.get()->ExtendTo(num, growthPct, context); + CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - impl_->size(0), growthPct, context); } void Extend(int64_t num, float growthPct, BaseContext* context) const { From dd73d576436fce03a6ca2517f78ac6a21af5d5ef Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 27 Sep 2018 18:29:06 -0700 Subject: [PATCH 25/82] Move TensorImpl::ShrinkTo to caffe2::Tensor (#12090) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12090 This is a slight pessimization because we need to do a full recompute of is_contiguous(), even though a modification of dim-0 is guaranteed to preserve contiguity. Reviewed By: jerryzh168 Differential Revision: D10050905 fbshipit-source-id: b99233e21c9f4275b0db6e76740462e5430ce152 --- aten/src/ATen/core/TensorImpl.h | 26 -------------------------- caffe2/core/tensor.h | 19 ++++++++++++++++++- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 0f44afee22ab6d..18d86a888cce76 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -503,32 +503,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { numel_ = newNumel; } - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shrunk tensor is maintained. - * Notably, this function does NOT respect caffe2_keep_on_shrink. 
-   */
-  void ShrinkTo(int64_t outer_dim) {
-    CAFFE_ENFORCE_WITH_CALLER(
-        is_contiguous_,
-        "Right now ShrinkTo is only supported on contiguous Tensor.");
-    CAFFE_ENFORCE_WITH_CALLER(sizes_.size() >= 1, "Tensor must be at least 1D");
-    CAFFE_ENFORCE_WITH_CALLER(
-        outer_dim <= sizes_[0],
-        "New outer dimension must be smaller than current.");
-    CAFFE_ENFORCE(
-        storage_.unique(),
-        "Can't call ShrinkTo on shared storage, please call Resize instead.");
-    sizes_[0] = outer_dim;
-    numel_ = std::accumulate(
-        sizes_.begin(),
-        sizes_.end(),
-        static_cast<int64_t>(1),
-        std::multiplies<int64_t>());
-  }
-
   /**
    * @brief Reserve space for the underlying tensor.
    *
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 331e26713d6add..0c0f8b6c38177e 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -142,8 +142,25 @@ class CAFFE2_API Tensor final {
     impl_.get()->Extend(num, growthPct, context);
   }
 
+  /**
+   * @brief Shrinks the outer-most dimension to given size, keeping the data.
+   *
+   * This method guarantees that no re-allocations are carried out, which means
+   * that the extra capacity after the end of the shrunk tensor is maintained.
+   * Notably, this function does NOT respect caffe2_keep_on_shrink.
+   */
   void ShrinkTo(int64_t outer_dim) const {
-    impl_.get()->ShrinkTo(outer_dim);
+    CAFFE_ENFORCE_WITH_CALLER(
+        impl_->is_contiguous(),
+        "Right now ShrinkTo is only supported on contiguous Tensor.");
+    CAFFE_ENFORCE_WITH_CALLER(impl_->dim() >= 1, "Tensor must be at least 1D");
+    CAFFE_ENFORCE_WITH_CALLER(
+        outer_dim <= impl_->size(0),
+        "New outer dimension must be smaller than current.");
+    CAFFE_ENFORCE(
+        impl_->storage().unique(),
+        "Can't call ShrinkTo on shared storage, please call Resize instead.");
+    impl_.get()->set_size(0, outer_dim);
   }
 
   template <class T>

From d02478e60774761a97628aa407863492fb9bae68 Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:11 -0700
Subject: [PATCH 26/82] Move TensorImpl::ResizeLike to caffe2::Tensor

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12091

Reviewed By: jerryzh168

Differential Revision: D10051012

fbshipit-source-id: 772ecd2e377f7d4e1ae510c1f647f6c8b71e5a57
---
 aten/src/ATen/core/TensorImpl.h | 15 ---------------
 caffe2/core/tensor.h            | 12 +++++++++++-
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 18d86a888cce76..abec1b45548704 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -581,21 +581,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     }
   }
 
-  /**
-   * Resize the tensor like the source tensor. Note that this is just a
-   * sugar wrapper that essentially calls Resize(src_tensor.dims()).
-   * This method respects caffe2_keep_on_shrink.
-   */
-  inline void ResizeLike(const TensorImpl& src_tensor) {
-    CAFFE_ENFORCE_WITH_CALLER(
-        src_tensor.is_contiguous(),
-        "Right now ResizeLike is only supported for contiguous Tensor.");
-    // Note: need casting for different context types.
-    if (static_cast<void*>(this) != static_cast<const void*>(&src_tensor)) {
-      Resize(src_tensor.dims());
-    }
-  }
-
   /**
    * Resizes the tensor without touching underlying storage.
   * This requires the total size of the tensor to remains constant.
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 0c0f8b6c38177e..6a1b0123f42543 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -173,8 +173,18 @@ class CAFFE2_API Tensor final {
     impl_.get()->Resize(dim_source...);
   }
 
+  /**
+   * Resize the tensor like the source tensor. Note that this is just a
+   * sugar wrapper that essentially calls Resize(src_tensor.dims()).
+   * This method respects caffe2_keep_on_shrink.
+   */
   inline void ResizeLike(const Tensor& src_tensor) const {
-    impl_.get()->ResizeLike(*src_tensor.impl_.get());
+    CAFFE_ENFORCE_WITH_CALLER(
+        src_tensor.is_contiguous(),
+        "Right now ResizeLike is only supported for contiguous Tensor.");
+    if (impl_ != src_tensor.impl_) {
+      impl_.get()->Resize(src_tensor.dims());
+    }
   }
 
   inline void Reshape(const vector<int64_t>& dims) const {

From 8c533c2c90fc495b0dbd93105e0d93c85ff66a99 Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:13 -0700
Subject: [PATCH 27/82] Fix bug where Reshape() trashes strides.

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12092

Reviewed By: jerryzh168

Differential Revision: D10051005

fbshipit-source-id: c36d1c8d12fb41baf8d1a1a9f38776deeff242de
---
 aten/src/ATen/core/TensorImpl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index abec1b45548704..2d0dfd55dc0999 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -604,6 +604,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
         "been changed. If you find this error, most likely you will need "
         "to change corresponding code from Reshape to Resize.");
     sizes_ = dims;
+    update_to_contiguous_strides();
   }
 
   inline void Reshape(const std::vector<int>& dims) {
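For readers skimming the one-line fix above: `update_to_contiguous_strides()`
rebuilds the strides as if the reshaped tensor were freshly laid out in
row-major order. A minimal standalone sketch of that invariant (illustrative
names, not the PyTorch implementation):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Row-major ("contiguous") strides: each stride is the product of all
    // sizes to its right; 0/1-sized dims are clamped to keep strides positive.
    std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& sizes) {
      std::vector<int64_t> strides(sizes.size());
      int64_t running = 1;
      for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
        strides[i] = running;
        running *= std::max<int64_t>(sizes[i], 1);
      }
      return strides;
    }

    int main() {
      // Before the fix, Reshape() updated sizes_ but left strides_ describing
      // the old shape; recomputing restores the row-major invariant.
      assert((contiguous_strides({2, 3, 4}) == std::vector<int64_t>{12, 4, 1}));
      return 0;
    }
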
From b0e48aa19773d2ec93ffbef7affb83048d1b255c Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:15 -0700
Subject: [PATCH 28/82] Move TensorImpl::Reshape(vector<int>) to caffe2::Tensor

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12094

Reviewed By: jerryzh168

Differential Revision: D10051079

fbshipit-source-id: 87fb91f31c33ce9b64c4654e79e0131ae391cd78
---
 aten/src/ATen/core/TensorImpl.h | 4 ----
 caffe2/core/tensor.h            | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 2d0dfd55dc0999..673b6b0f521755 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -607,10 +607,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     update_to_contiguous_strides();
   }
 
-  inline void Reshape(const std::vector<int>& dims) {
-    Reshape(ToVectorint64_t(dims));
-  }
-
   /**
    * Release whatever memory the tensor was holding but keep size and type
   * information. Subsequent call to mutable_data will trigger new memory
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 6a1b0123f42543..3ac734251a55fb 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -192,7 +192,7 @@ class CAFFE2_API Tensor final {
   }
 
   inline void Reshape(const vector<int>& dims) const {
-    impl_.get()->Reshape(dims);
+    impl_.get()->Reshape(ToVectorint64_t(dims));
   }
 
   inline void FreeMemory() const {

From 976a9e04542071f62b786e1c391ac9c5e5aec7fb Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:17 -0700
Subject: [PATCH 29/82] Move TensorImpl::DebugString() to caffe2::Tensor

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12095

Reviewed By: jerryzh168

Differential Revision: D10051078

fbshipit-source-id: f56b6fc5d1cb8ae4b636e88efe607fe65cc1d7a0
---
 aten/src/ATen/core/TensorImpl.h | 16 ----------------
 caffe2/core/tensor.h            | 14 +++++++++++++-
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 673b6b0f521755..d02a50047a7713 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -618,22 +618,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     storage_offset_ = 0;
   }
 
-  /**
-   * A utility function to print the debug string for the tensor. Note that this
-   * is very slow since it involves quite some string operations, so do not use
-   * it in your performance-critical code.
-   */
-  std::string DebugString() const {
-    std::stringstream ss;
-    ss << "A Tensor of item size " << storage_.itemsize() << " and type "
-       << data_type_.name() << " and dimension (";
-    for (int d : sizes_) {
-      ss << d << ",";
-    }
-    ss << ").";
-    return ss.str();
-  }
-
   /**
    * @brief Shares the data with another tensor.
   *
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 3ac734251a55fb..4996c738c9172a 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -199,8 +199,20 @@ class CAFFE2_API Tensor final {
     impl_.get()->FreeMemory();
   }
 
+  /**
+   * A utility function to print the debug string for the tensor. Note that this
+   * is very slow since it involves quite some string operations, so do not use
+   * it in your performance-critical code.
+   */
   string DebugString() const {
-    return impl_.get()->DebugString();
+    std::stringstream ss;
+    ss << "A Tensor of item size " << impl_->storage().itemsize() << " and type "
+       << impl_->dtype().name() << " and dimension (";
+    for (int d : impl_->sizes()) {
+      ss << d << ",";
+    }
+    ss << ").";
+    return ss.str();
   }
 
   // NB: a.swap(b) is not equivalent to std::swap(a, b);

From 2021b26bcb1514b6e3b58fb86f1c252788a55d46 Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:20 -0700
Subject: [PATCH 30/82] Move TensorImpl::ShareExternalPointer helper overloads
 to caffe2::Tensor

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12096

Reviewed By: jerryzh168

Differential Revision: D10051126

fbshipit-source-id: a9b95d00512a0b4e6339d4f3f0bb180dd0c79247
---
 aten/src/ATen/core/TensorImpl.h | 36 ---------------------------------
 caffe2/core/tensor.h            | 25 +++++++++++++++++++----
 2 files changed, 21 insertions(+), 40 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index d02a50047a7713..938815201c406d 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -653,42 +653,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     storage_offset_ = src.storage_offset();
   }
 
-  /**
-   * @brief Shares the data with an externally managed pointer.
-   *
-   * This is similar to ShareData() but the source is a pointer with an advanced
-   * deleter option. In default, no deletion takes place, and one needs to make
-   * sure that the external memory is deallocated only after the tensor finishes
-   * using it. If a Deleter object is passed in, when this tensor is reallocated
-   * or freed, the deleter function is going to be called.
-   */
-  template <typename T>
-  void
-  ShareExternalPointer(T* src, size_t capacity = 0, caffe2::MemoryDeleter d = nullptr) {
-    ShareExternalPointer((void*)src, caffe2::TypeMeta::Make<T>(), capacity, d);
-  }
-
-  template <typename T>
-  void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) {
-    ShareExternalPointer(std::move(data_ptr), caffe2::TypeMeta::Make<T>(), capacity);
-  }
-
-  void ShareExternalPointer(
-      void* src,
-      const caffe2::TypeMeta& data_type,
-      size_t capacity = 0,
-      caffe2::MemoryDeleter d = nullptr) {
-    CAFFE_ENFORCE_WITH_CALLER(
-        is_contiguous_,
-        "Right now ShareExternalPointer is only supported for contiguos Tensor.");
-    CAFFE_ENFORCE_WITH_CALLER(
-        data_type.id() != caffe2::TypeIdentifier::uninitialized(),
-        "To share with a raw external pointer you need to pass in an "
-        "initialized data_type(TypeMeta).");
-    ShareExternalPointer(
-        at::DataPtr(src, src, d, device_type()), data_type, capacity);
-  }
-
   void ShareExternalPointer(
       at::DataPtr&& data_ptr,
       const caffe2::TypeMeta& data_type,
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 4996c738c9172a..f84efb8b1dbbcd 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -227,25 +227,42 @@ class CAFFE2_API Tensor final {
     impl_.get()->ShareData(*src.impl_.get());
   }
 
+  /**
+   * @brief Shares the data with an externally managed pointer.
+   *
+   * This is similar to ShareData() but the source is a pointer with an advanced
+   * deleter option. In default, no deletion takes place, and one needs to make
+   * sure that the external memory is deallocated only after the tensor finishes
+   * using it. If a Deleter object is passed in, when this tensor is reallocated
+   * or freed, the deleter function is going to be called.
+   */
   template <typename T>
   void ShareExternalPointer(
       T* src,
       size_t capacity = 0,
       MemoryDeleter d = nullptr) const {
-    impl_.get()->ShareExternalPointer(src, capacity, d);
+    ShareExternalPointer((void*)src, caffe2::TypeMeta::Make<T>(), capacity, d);
   }
 
   template <typename T>
   void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const {
-    impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity);
+    ShareExternalPointer(std::move(data_ptr), caffe2::TypeMeta::Make<T>(), capacity);
   }
 
   void ShareExternalPointer(
       void* src,
-      const TypeMeta& meta,
+      const TypeMeta& data_type,
       size_t capacity = 0,
       MemoryDeleter d = nullptr) const {
-    impl_.get()->ShareExternalPointer(src, meta, capacity, d);
+    CAFFE_ENFORCE_WITH_CALLER(
+        impl_->is_contiguous(),
+        "Right now ShareExternalPointer is only supported for contiguous Tensor.");
+    CAFFE_ENFORCE_WITH_CALLER(
+        data_type.id() != caffe2::TypeIdentifier::uninitialized(),
+        "To share with a raw external pointer you need to pass in an "
+        "initialized data_type(TypeMeta).");
+    impl_.get()->ShareExternalPointer(
+        at::DataPtr(src, src, d, impl_->device_type()), data_type, capacity);
   }
 
   void ShareExternalPointer(

From a86a61b004889e30563f184f67867788c6dba14e Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 18:29:22 -0700
Subject: [PATCH 31/82] Implement caffe2::Tensor::raw_data() in terms of data()

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12097

Reviewed By: jerryzh168

Differential Revision: D10051202

fbshipit-source-id: b4b61869363a606ab465d1500558226efae30d06
---
 aten/src/ATen/core/TensorImpl.h | 37 ++++++---------------------------
 caffe2/core/tensor.h            | 22 ++++++++++++++++++--
 2 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 938815201c406d..24b5789ee5ca0f 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -253,6 +253,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
 
   inline void* data() const {
     AT_ASSERT(!is_variable());
+    CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0);
     return static_cast<void*>(
         static_cast<char*>(storage_.data()) +
         data_type_.itemsize() * storage_offset_);
@@ -427,7 +428,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
         CAFFE_ENFORCE(
             src.device_type() == ::at::DeviceType::CPU,
             "In CopyFrom source and dest tensors must both be CPU for meta copy");
-        data_type_.copy()(src.raw_data(), raw_mutable_data(), size());
+        data_type_.copy()(src.data(), raw_mutable_data(data_type_), size());
       } else {
         // We'll need to use a non-CPU context to perform the copy if
         // one of the context is not CPU since only non-CPU context
@@ -435,20 +436,20 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
         if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) {
           if (!context) {
             src.CreateContext()->CopyBytesToDevice(
-                nbytes(), src.raw_data(), raw_mutable_data(), device_type());
+                nbytes(), src.data(), raw_mutable_data(data_type_), device_type());
           } else {
             CAFFE_ENFORCE(
                 context->device_type() == src.device_type(),
                 "Type for provided context does not match the type of source");
             context->CopyBytesToDevice(
-                nbytes(), src.raw_data(), raw_mutable_data(), device_type());
+                nbytes(), src.data(), raw_mutable_data(data_type_), device_type());
           }
         } else {
           // In case source context is CPU, and target context is non-CPU
          // We'll have to create a Context from target and perform the
          // copy using that context
          CreateContext()->CopyBytesFromCPU(
-              nbytes(), src.raw_data(), raw_mutable_data());
+              nbytes(), src.data(), raw_mutable_data(data_type_));
         }
       }
     }
@@ -681,15 +682,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     }
   }
 
-  /**
-   * Returns a const raw void* pointer of the underlying storage. mutable_data()
-   * or raw_mutable_data() must have been called prior to this function call.
-   */
-  inline const void* raw_data() const {
-    CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0);
-    return static_cast<const void*>(static_cast<const char*>(storage_.data()) + storage_offset_ * storage_.itemsize());
-  }
-
   /**
    * Returns a mutable raw pointer of the underlying storage. Since we will need
    * to know the type of the data for allocation, a TypeMeta object is passed in
@@ -771,23 +763,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     }
   }
 
-  /**
-   * Returns a mutable raw pointer of the underlying storage. This can only be
-   * used when you know for sure that the underlying storage of the tensor is
-   * already created via an earlier raw_mutable_data(meta) call or a
-   * mutable_data() call.
-   *
-   * If the existing data does not match the desired type, it will be deleted
-   * and a new storage will be created.
-   */
-  inline void* raw_mutable_data() {
-    CAFFE_ENFORCE_WITH_CALLER(
-        data_type_.id() != caffe2::TypeIdentifier::uninitialized(),
-        "Calling raw_mutable_data() without meta, but the current meta is "
-        "of unknown type.");
-    return raw_mutable_data(data_type_);
-  }
-
   /**
    * Returns a typed pointer of the underlying storage.
   *
@@ -920,7 +895,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
   void ExtractDeviceOption(caffe2::DeviceOption* device) const {
     auto* context = GetStaticContext();
     CHECK(context);
-    context->ExtractDeviceOption(device, raw_data());
+    context->ExtractDeviceOption(device, data());
   }
 
  protected:
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index f84efb8b1dbbcd..8e670858e89284 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -272,8 +272,12 @@ class CAFFE2_API Tensor final {
     impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity);
   }
 
+  /**
+   * Returns a const raw void* pointer of the underlying storage. mutable_data()
+   * or raw_mutable_data() must have been called prior to this function call.
+   */
   inline const void* raw_data() const {
-    return impl_.get()->raw_data();
+    return impl_->data();
   }
 
   template <typename T>
@@ -285,8 +289,22 @@ class CAFFE2_API Tensor final {
     return impl_.get()->raw_mutable_data(meta);
   }
 
+  /**
+   * Returns a mutable raw pointer of the underlying storage. This can only be
+   * used when you know for sure that the underlying storage of the tensor is
+   * already created via an earlier raw_mutable_data(meta) call or a
+   * mutable_data() call.
+   *
+   * If the existing data does not match the desired type, it will be deleted
+   * and a new storage will be created.
+   */
   inline void* raw_mutable_data() const {
-    return impl_.get()->raw_mutable_data();
+    const auto& data_type = impl_->dtype();
+    CAFFE_ENFORCE_WITH_CALLER(
+        data_type.id() != caffe2::TypeIdentifier::uninitialized(),
+        "Calling raw_mutable_data() without meta, but the current meta is "
+        "of unknown type.");
+    return raw_mutable_data(data_type);
   }
 
   template <typename T>
Yang" Date: Thu, 27 Sep 2018 19:01:29 -0700 Subject: [PATCH 32/82] Rewrite serialization to correctly handle partial reads/writes in all cases (#12143) Summary: Previously, doRead/doWrite were functions that could return partial reads/writes, and we checked for this case inconsistently in the call sites of serialization.cpp. Now, these functions do NOT return the amount of bytes read/written, and instead handle the necessary checking loop themselves. Fixes #12042. Maybe. Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/12143 Differential Revision: D10097027 Pulled By: ezyang fbshipit-source-id: fd222ab8a825bed352153648ad396acfe124a3e1 --- torch/csrc/generic/serialization.cpp | 46 ++------- torch/csrc/serialization.cpp | 143 +++++++++++++++++++-------- torch/csrc/serialization.h | 4 +- 3 files changed, 112 insertions(+), 81 deletions(-) diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp index 2299cce245a16b..1e4e7bf7b9e37f 100644 --- a/torch/csrc/generic/serialization.cpp +++ b/torch/csrc/generic/serialization.cpp @@ -2,8 +2,6 @@ #define TH_GENERIC_FILE "generic/serialization.cpp" #else -#define SYSCHECK(call) { ssize_t __result = call; if (__result < 0) throw std::system_error((int) __result, std::system_category()); } - template void THPStorage_(writeFileRaw)(THWStorage *self, io fd) { @@ -16,23 +14,10 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) data = (scalar_t*)cpu_data.get(); THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(scalar_t), cudaMemcpyDeviceToHost)); #endif - ssize_t result = doWrite(fd, &size, sizeof(int64_t)); - if (result != sizeof(int64_t)) - throw std::system_error(result, std::system_category()); + doWrite(fd, &size, sizeof(int64_t)); // fast track for bytes and little endian if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { - char *bytes = (char *) data; - int64_t remaining = sizeof(scalar_t) * size; - while (remaining > 0) { - // we write and read in 1GB blocks to avoid bugs on some OSes - ssize_t result = doWrite(fd, bytes, THMin(remaining, 1073741824)); - if (result < 0) - throw std::system_error(result, std::system_category()); - bytes += result; - remaining -= result; - } - if (remaining != 0) - throw std::system_error(result, std::system_category()); + doWrite(fd, data, sizeof(scalar_t) * size); } else { int64_t buffer_size = std::min(size, (int64_t)5000); std::unique_ptr le_buffer(new uint8_t[buffer_size * sizeof(scalar_t)]); @@ -54,7 +39,7 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) THPByteOrder::THP_LITTLE_ENDIAN, to_convert); } - SYSCHECK(doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t))); + doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t)); } } } @@ -67,11 +52,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) { scalar_t *data; int64_t size; - ssize_t result = doRead(file, &size, sizeof(int64_t)); - if (result == 0) - throw std::runtime_error("unexpected EOF. 
-  if (result != sizeof(int64_t))
-    throw std::system_error(result, std::system_category());
+  doRead(file, &size, sizeof(int64_t));
   THWStoragePtr storage;
   if (_storage == nullptr) {
     storage = THWStorage_(newWithSize)(LIBRARY_STATE size);
@@ -91,20 +72,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage)
 
   // fast track for bytes and little endian
   if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
-    char *bytes = (char *) data;
-    int64_t remaining = sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage);
-    while (remaining > 0) {
-      // we write and read in 1GB blocks to avoid bugs on some OSes
-      ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824));
-      if (result == 0) // 0 means EOF, which is also an error
-        throw std::runtime_error("unexpected EOF. The file might be corrupted.");
-      if (result < 0)
-        throw std::system_error(result, std::system_category());
-      bytes += result;
-      remaining -= result;
-    }
-    if (remaining != 0)
-      throw std::system_error(result, std::system_category());
+    doRead(file, data, sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage));
   } else {
     int64_t buffer_size = std::min(size, (int64_t)5000);
     std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(scalar_t)]);
 
     for (int64_t i = 0; i < size; i += buffer_size) {
       size_t to_convert = std::min(size - i, buffer_size);
-      SYSCHECK(doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert));
+      doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert);
 
       if (sizeof(scalar_t) == 2) {
         THP_decodeInt16Buffer((int16_t*)data + i,
@@ -142,6 +110,4 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage)
 template THWStorage* THPStorage_(readFileRaw)<int>(int fd, THWStorage* storage);
 template THWStorage* THPStorage_(readFileRaw)<PyObject*>(PyObject* fd, THWStorage* storage);
 
-#undef SYSCHECK
-
 #endif
diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp
index eaf93b92be14bb..de98d278d11a10 100644
--- a/torch/csrc/serialization.cpp
+++ b/torch/csrc/serialization.cpp
@@ -4,34 +4,41 @@
 #include "THP.h"
 #include "serialization.h"
 
-static ssize_t doPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes);
-static ssize_t doPythonReadInto(PyObject* fildes, void* buf, size_t nbytes);
-static ssize_t doPythonWrite(PyObject* fildes, void* buf, size_t nbytes);
+template <class io>
+ssize_t doPartialRead(io fildes, void* buf, size_t nbytes);
+
+template <class io>
+ssize_t doPartialWrite(io fildes, void* buf, size_t nbytes);
+
+static ssize_t doPartialPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes);
+static ssize_t doPartialPythonReadInto(PyObject* fildes, void* buf, size_t nbytes);
+static ssize_t doPartialPythonWrite(PyObject* fildes, void* buf, size_t nbytes);
 
 template <>
-ssize_t doRead<int>(int fildes, void* buf, size_t nbytes) {
+ssize_t doPartialRead<int>(int fildes, void* buf, size_t nbytes) {
   return read(fildes, buf, nbytes);
 }
 
 template <>
-ssize_t doRead<PyObject*>(PyObject* fildes, void* buf, size_t nbytes) {
+ssize_t doPartialRead<PyObject*>(PyObject* fildes, void* buf, size_t nbytes) {
   // Try to use fildes.readinto() instead of fildes.read()
  // because it is more memory efficient.
+  // TODO: Stop calling PyObject_HasAttrString() in a loop on our read loop
   auto has_readinto = PyObject_HasAttrString(fildes, "readinto") == 1;
   if (has_readinto) {
-    return doPythonReadInto(fildes, buf, nbytes);
+    return doPartialPythonReadInto(fildes, buf, nbytes);
   }
-  return doPythonReadBuffered(fildes, buf, nbytes);
+  return doPartialPythonReadBuffered(fildes, buf, nbytes);
 }
 
 template <>
-ssize_t doWrite<int>(int fildes, void* buf, size_t nbytes) {
+ssize_t doPartialWrite<int>(int fildes, void* buf, size_t nbytes) {
   return write(fildes, buf, nbytes);
 }
 
 template <>
-ssize_t doWrite<PyObject*>(PyObject* fildes, void* buf, size_t nbytes) {
-  return doPythonWrite(fildes, buf, nbytes);
+ssize_t doPartialWrite<PyObject*>(PyObject* fildes, void* buf, size_t nbytes) {
+  return doPartialPythonWrite(fildes, buf, nbytes);
 }
 
 static inline bool isUnsupportedOperation() {
@@ -43,39 +50,39 @@ static inline bool isUnsupportedOperation() {
 }
 
 // Call Python fildes.read(nbytes) and copy it to buf.
-static inline ssize_t doPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes) {
-  const size_t buffer_size = 262144; // 2^18
-  size_t read_bytes = 0;
-
-  while (read_bytes < nbytes) {
-    auto remaining = nbytes - read_bytes;
-    auto to_read = remaining > buffer_size ? buffer_size : remaining;
-    THPObjectPtr r(PyObject_CallMethod(fildes, "read", "i", to_read));
-    if (!r) throw python_error();
-
-    // read output is String (Python 2) / Bytes (Python 3)
+static inline ssize_t doPartialPythonReadBuffered(PyObject* fildes, void* buf, size_t raw_nbytes) {
+  // If we request a large amount of data, f.read() will internally try to
+  // allocate a buffer of that size.  This is counterproductive, because
+  // it's not the buffer we ultimately want to write the data into.  Read
+  // less than that and avoid allocating too much extra memory.
+  // TODO: Maybe 260 KB is a bit small...
+  const size_t nbytes = std::min<size_t>(raw_nbytes, 262144u); // 2^18 (~260 KB)
+
+  THPObjectPtr r(PyObject_CallMethod(fildes, "read", "i", nbytes));
+  if (!r) throw python_error();
+
+  // read output is String (Python 2) / Bytes (Python 3)
 #if PY_MAJOR_VERSION >= 3
-    auto size = PyBytes_GET_SIZE(r.get());
-    const void* bytes = PyBytes_AsString(r.get());
+  auto size = PyBytes_GET_SIZE(r.get());
+  const void* py_buf = PyBytes_AsString(r.get());
 #else
-    auto size = PyString_GET_SIZE(r.get());
-    const void* bytes = PyString_AsString(r.get());
+  auto size = PyString_GET_SIZE(r.get());
+  const void* py_buf = PyString_AsString(r.get());
 #endif
 
-    // we read EOF
-    if (size == 0) {
-      return read_bytes;
-    }
+  // we read EOF
+  if (size == 0) {
+    return 0;
+  }
 
-    memcpy(reinterpret_cast<char*>(buf) + read_bytes, bytes, size);
-    read_bytes += size;
-  } // Reading loop
+  // Slurp it into the buffer we actually want
+  memcpy(buf, py_buf, size);
 
-  return read_bytes;
+  return size;
 }
 
 // Either does fildes.readinto(buf) or fildes.write(buf)
-static inline ssize_t doPythonIO(PyObject* fildes, void* buf, size_t nbytes, bool is_read) {
+static inline ssize_t doPartialPythonIO(PyObject* fildes, void* buf, size_t nbytes, bool is_read) {
 #if PY_MAJOR_VERSION >= 3
   auto rw_flag = is_read ? PyBUF_WRITE : PyBUF_READ;
   THPObjectPtr memview(PyMemoryView_FromMemory(
@@ -97,19 +104,77 @@ static inline ssize_t doPythonIO(PyObject* fildes, void* buf, size_t nbytes, boo
  // fildes.readinto can return UnsupportedOperation so fall back to fildes.read.
   if (is_read && isUnsupportedOperation()) {
     PyErr_Clear();
-    return doPythonReadBuffered(fildes, buf, nbytes);
+    return doPartialPythonReadBuffered(fildes, buf, nbytes);
   }
   throw python_error();
 }
 
 // Call Python fildes.readinto(buf)
-static ssize_t doPythonReadInto(PyObject* fildes, void* buf, size_t nbytes) {
-  return doPythonIO(fildes, buf, nbytes, /* is_read */ true);
+static ssize_t doPartialPythonReadInto(PyObject* fildes, void* buf, size_t nbytes) {
+  return doPartialPythonIO(fildes, buf, nbytes, /* is_read */ true);
 }
 
 // Call Python fildes.write(buf)
-static ssize_t doPythonWrite(PyObject* fildes, void* buf, size_t nbytes) {
-  return doPythonIO(fildes, buf, nbytes, /* is_read */ false);
+static ssize_t doPartialPythonWrite(PyObject* fildes, void* buf, size_t nbytes) {
+  return doPartialPythonIO(fildes, buf, nbytes, /* is_read */ false);
+}
+
+// Requires that we read EXACTLY nbytes; fails if we don't.
+template <class io>
+void doRead(io fildes, void* raw_buf, size_t nbytes) {
+  char* buf = static_cast<char*>(raw_buf);
+  while (nbytes > 0) {
+    errno = 0; // doPartialRead may not set errno
+    // we read in 1GB blocks to avoid bugs on Mac OS X Lion
+    // see https://github.com/pytorch/pytorch/issues/1031 for more details
+    ssize_t r = doPartialRead<io>(fildes, buf, std::min<size_t>(nbytes, 1073741824));
+    if (r < 0) {
+      int err = errno;
+      AT_ASSERTM(err != 0, "read(): impossible! r < 0, but no errno was set");
+      AT_ASSERTM(err != EAGAIN, "read(): non-blocking fd ", fildes,
+                 " read EAGAIN; cowardly refusing to spin-wait");
+      if (err == EINTR) {
+        continue;
+      } else {
+        AT_ERROR("read(): fd ", fildes, " failed with ", strerror(err));
+      }
+    } else if (r == 0) {
+      break;
+    }
+    buf += r;
+    // This is guaranteed by POSIX, but I just want to be double-sure
+    // to not underflow a signed integer.
+    AT_ASSERT(static_cast<size_t>(r) <= nbytes);
+    nbytes -= r;
+  }
+  if (nbytes != 0) {
+    AT_ERROR("unexpected EOF, expected ", nbytes, " more bytes. The file might be corrupted.");
+  }
+}
+
+template <class io>
+void doWrite(io fildes, void* raw_buf, size_t nbytes) {
+  char* buf = static_cast<char*>(raw_buf);
+  while (nbytes > 0) {
+    errno = 0; // doPartialWrite may not set errno
+    // we write in 1GB blocks to avoid bugs on Mac OS X Lion
+    // see https://github.com/pytorch/pytorch/issues/1031 for more details
+    ssize_t r = doPartialWrite<io>(fildes, buf, std::min<size_t>(nbytes, 1073741824));
+    if (r < 0) {
+      int err = errno;
+      AT_ASSERTM(err != 0, "write(): impossible! r < 0, but no errno was set");
+      AT_ASSERTM(err != EAGAIN, "write(): non-blocking fd ", fildes,
+                 " read EAGAIN; cowardly refusing to spin-wait");
+      if (err == EINTR) {
+        continue;
+      } else {
+        AT_ERROR("write(): fd ", fildes, " failed with ", strerror(err));
+      }
+    }
+    buf += r;
+    AT_ASSERT(static_cast<size_t>(r) <= nbytes);
+    nbytes -= r;
+  }
 }
 
 #include "generic/serialization.cpp"
diff --git a/torch/csrc/serialization.h b/torch/csrc/serialization.h
index 410619a68422c5..df811052fe7cda 100644
--- a/torch/csrc/serialization.h
+++ b/torch/csrc/serialization.h
@@ -8,9 +8,9 @@
 #include
 
 template <class io>
-ssize_t doRead(io fildes, void* buf, size_t nbytes);
+void doRead(io fildes, void* buf, size_t nbytes);
 
 template <class io>
-ssize_t doWrite(io fildes, void* buf, size_t nbytes);
+void doWrite(io fildes, void* buf, size_t nbytes);
 
 #endif
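The core of the rewrite above is the "loop until exactly nbytes or fail"
contract. A standalone sketch of that pattern for the plain POSIX-fd case
(illustrative only; read_exactly is not a PyTorch function):

    #include <cerrno>
    #include <cstddef>
    #include <stdexcept>
    #include <unistd.h>

    // Read exactly nbytes or throw: retry on EINTR, advance past partial
    // reads, and treat early EOF as an error instead of returning a count.
    void read_exactly(int fd, char* buf, size_t nbytes) {
      while (nbytes > 0) {
        ssize_t r = ::read(fd, buf, nbytes);
        if (r < 0) {
          if (errno == EINTR) continue;   // interrupted before any data: retry
          throw std::runtime_error("read failed");
        }
        if (r == 0) break;                // EOF before we got everything
        buf += r;                         // partial read: advance and keep going
        nbytes -= static_cast<size_t>(r);
      }
      if (nbytes != 0) throw std::runtime_error("unexpected EOF");
    }

This is the invariant the callers in generic/serialization.cpp now rely on,
which is why their hand-rolled retry loops could be deleted.
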
From 7f35e92af293c9232f974b119d25bbebb830d9d6 Mon Sep 17 00:00:00 2001
From: Michael Suo
Date: Thu, 27 Sep 2018 19:12:01 -0700
Subject: [PATCH 33/82] mutable lists (#10700)

Summary:
This PR implements the design that we discussed. Changes:
- Added a World token IValue and type. The IValue is basically a dummy struct
  for now; in the future we may extend it (say, add thread-local state).
- Effectful ops explicitly declare they are mutable by having World tokens as
  inputs and outputs in their schema.
- Purely functional ops that use mutable values will get "fenced", and the
  world token will be threaded through the fences.
- Added an AnnotateEffects pass which wires up all the world tokens together.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/10700

Reviewed By: eellison

Differential Revision: D9547881

Pulled By: michaelsuo

fbshipit-source-id: ebbd786c31f15bf45e2ddb0c188438ff2f5f3c88
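To see the intent of the world-token design at a glance, here is a minimal
C++ analogy (illustrative names only; this is not the JIT API): every
effectful call consumes the current token and yields a fresh one, so later
effects are explicitly ordered after earlier ones by a data dependency.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Stand-in for the World token: an opaque ordering handle.
    struct World { int64_t world_id = 0; };

    // An "effectful op": consumes the current token, performs the mutation,
    // and yields a fresh token that later effects must depend on -- the way
    // an op schema with World inputs/outputs does in the graph.
    World effectful_append(World w, std::vector<int>& lst, int value) {
      lst.push_back(value);
      return World{w.world_id + 1};
    }

    int main() {
      World w;
      std::vector<int> xs;
      w = effectful_append(w, xs, 1);
      w = effectful_append(w, xs, 2);  // ordered after the first append via w
      assert(xs.size() == 2 && w.world_id == 2);
      return 0;
    }
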
---
 aten/src/ATen/core/ivalue.cpp                 |   9 +-
 aten/src/ATen/core/ivalue.h                   |  61 +++-
 test/expect/TestBatched.test_for.expect       |  14 +-
 test/expect/TestBatched.test_while.expect     |  60 ++--
 ....test_call_script_fn_from_script_fn.expect |   6 +-
 ...test_call_script_mod_from_script_fn.expect |  20 +-
 ..._call_script_mod_from_script_module.expect |   6 +-
 test/test_jit.py                              |  94 ++++-
 torch/CMakeLists.txt                          |   1 +
 torch/csrc/jit/constants.cpp                  |   2 +
 torch/csrc/jit/export.cpp                     |   2 +
 torch/csrc/jit/function_schema.h              |  28 +-
 torch/csrc/jit/graph_executor.cpp             |   1 +
 torch/csrc/jit/import.cpp                     |   2 +
 torch/csrc/jit/interned_strings.h             |   5 +
 torch/csrc/jit/interpreter.cpp                |  19 +-
 torch/csrc/jit/ir.cpp                         |  21 +-
 torch/csrc/jit/ir.h                           |  53 +--
 torch/csrc/jit/operator.cpp                   |   1 +
 torch/csrc/jit/passes/annotate_effects.cpp    | 320 ++++++++++++++++++
 torch/csrc/jit/passes/annotate_effects.h      |  11 +
 .../csrc/jit/passes/constant_propagation.cpp  |   2 +
 .../csrc/jit/passes/dead_code_elimination.cpp |  13 +-
 torch/csrc/jit/passes/to_batch.cpp            |   3 +-
 torch/csrc/jit/pybind_utils.h                 |   4 +-
 torch/csrc/jit/python_ir.cpp                  |   2 +
 torch/csrc/jit/register_prim_ops.cpp          |  51 ++-
 torch/csrc/jit/script/compiler.cpp            | 269 ++++++++-------
 torch/csrc/jit/type.cpp                       |   6 +
 torch/csrc/jit/type.h                         |  29 +-
 30 files changed, 866 insertions(+), 249 deletions(-)
 create mode 100644 torch/csrc/jit/passes/annotate_effects.cpp
 create mode 100644 torch/csrc/jit/passes/annotate_effects.h

diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 8077f935ae8242..5df0a5b49ca93b 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -12,7 +12,8 @@
   _(String) \
   _(TensorList) \
   _(Blob) \
-  _(GenericList)
+  _(GenericList) \
+  _(World) \
 
 namespace torch {
 namespace jit {
@@ -24,7 +25,7 @@ CAFFE2_API c10::intrusive_ptr<ConstantString> ConstantString::create(
 
 namespace {
 
 template <typename T>
-std::ostream& printList(std::ostream & out, const ConstantList<T> &v,
+std::ostream& printList(std::ostream & out, const List<T> &v,
   const std::string start, const std::string delim, const std::string finish) {
   out << start;
   for(size_t i = 0; i < v.elements().size(); ++i) {
@@ -48,13 +49,13 @@ std::ostream& operator<<(std::ostream & out, const ConstantString & v) {
 }
 
 template <typename T>
-std::ostream& operator<<(std::ostream & out, const ConstantList<T> & v) {
+std::ostream& operator<<(std::ostream & out, const List<T> & v) {
   return printList(out, v, "[", ", ", "]");
 }
 
 // tuple case
 template<>
-std::ostream& operator<<(std::ostream & out, const ConstantList<IValue> & v) {
+std::ostream& operator<<(std::ostream & out, const List<IValue> & v) {
   return printList(out, v, "(", ", ", ")");
 }
 
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 5613dc42357fa9..5e210d638d9226 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -33,17 +33,17 @@ struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target {
       const ConstantString& v);
 };
 
-// non-mutable list
 template <typename Elem>
-struct C10_EXPORT ConstantList : c10::intrusive_ptr_target {
+struct C10_EXPORT List : c10::intrusive_ptr_target {
  private:
-  const std::vector<Elem> elements_;
+  std::vector<Elem> elements_;
+
 public:
  typedef Elem ElemType;
-  ConstantList(std::vector<Elem> elements_)
-  : elements_(std::move(elements_)) {}
-  static c10::intrusive_ptr<ConstantList<Elem>> create(std::vector<Elem> elements_) {
-    return c10::make_intrusive<ConstantList<Elem>>(std::move(elements_));
+
+  List(std::vector<Elem> elements_) : elements_(std::move(elements_)) {}
+  static c10::intrusive_ptr<List<Elem>> create(std::vector<Elem> elements_) {
+    return c10::make_intrusive<List<Elem>>(std::move(elements_));
   }
   const std::vector<Elem>& elements() const {
     return elements_;
@@ -51,19 +51,30 @@ struct C10_EXPORT ConstantList : c10::intrusive_ptr_target {
   operator const std::vector<Elem>&() const {
     return elements();
   }
+
+  std::vector<Elem>& elements() {
+    return elements_;
+  }
+  operator std::vector<Elem>&() {
+    return elements();
+  }
+};
+
+struct World {
+  int64_t world_id;
+};
 
 struct IValue;
-struct C10_EXPORT Tuple : public ConstantList<IValue> {
-  using ConstantList<IValue>::ConstantList;
+struct C10_EXPORT Tuple : public List<IValue> {
+  using List<IValue>::List;
   static c10::intrusive_ptr<Tuple> create(std::vector<IValue> elements_) {
     return c10::make_intrusive<Tuple>(std::move(elements_));
   }
 };
-using IntList = ConstantList<int64_t>;
-using TensorList = ConstantList<at::Tensor>;
-using DoubleList = ConstantList<double>;
-using GenericList = ConstantList<IValue>;
+using IntList = List<int64_t>;
+using TensorList = List<at::Tensor>;
+using DoubleList = List<double>;
+using GenericList = List<IValue>;
 
 // IValue is the generic tagged union used by the interpreter to hold
 // all value types.
@@ -83,7 +94,8 @@ using GenericList = ConstantList<IValue>;
   _(String) \
   _(TensorList) \
   _(Blob) \
-  _(GenericList)
+  _(GenericList) \
+  _(World) \
 
 struct CAFFE2_API IValue final {
   IValue()
@@ -143,6 +155,13 @@ struct CAFFE2_API IValue final {
     return at::Tensor(toIntrusivePtr<at::TensorImpl, at::UndefinedTensorImpl>());
   }
 
+  const IValue& toIValue() const {
+    return *this;
+  }
+  IValue& toIValue() {
+    return *this;
+  }
+
   IValue(caffe2::Blob blob) : tag(Tag::Blob), is_intrusive_ptr(true) {
     // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract
     // and
@@ -185,6 +204,17 @@ struct CAFFE2_API IValue final {
     return payload.as_double;
   }
 
+  // World
+  IValue(World w)
+  : tag(Tag::World), is_intrusive_ptr(false) {
+    payload.as_world = w;
+  }
+  bool isWorld() const { return Tag::World == tag; }
+  World toWorld() const {
+    AT_ASSERT(isWorld());
+    return payload.as_world;
+  }
+
   // Int
   IValue(int64_t i)
   : tag(Tag::Int), is_intrusive_ptr(false) {
@@ -367,6 +397,7 @@ struct CAFFE2_API IValue final {
     int64_t as_int;
     double as_double;
     c10::intrusive_ptr_target* as_intrusive_ptr;
+    World as_world;
   } payload;
   Tag tag;
   bool is_intrusive_ptr;
@@ -399,6 +430,8 @@ DEFINE_TO(std::vector<int64_t>, toIntListRef)
 DEFINE_TO(std::vector<double>, toDoubleListRef)
 DEFINE_TO(std::vector<at::Tensor>, toTensorListRef)
 DEFINE_TO(std::vector<IValue>, toGenericListRef)
+DEFINE_TO(World, toWorld)
+DEFINE_TO(IValue, toIValue)
 
 #undef DEFINE_TO
 
diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect
index bcbcffaee486a3..8932957402c94e 100644
--- a/test/expect/TestBatched.test_for.expect
+++ b/test/expect/TestBatched.test_for.expect
@@ -6,17 +6,17 @@ graph(%x.1_data : Dynamic
       %y_dims : Dynamic) {
   %6 : int = prim::Constant[value=10]()
   %7 : int = prim::Constant[value=1]()
-  %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims)
+  %x : Dynamic, %9 : Dynamic, %10 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims)
    block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims :
Dynamic) { - %13 : int = prim::Constant[value=1]() - %14 : Long() = prim::NumToTensor(%13) - %alpha : float = prim::TensorToNum(%14) + %15 : int = prim::Constant[value=1]() + %16 : Long() = prim::NumToTensor(%15) + %alpha : float = prim::TensorToNum(%16) %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) %mask : Dynamic = aten::mul(%5_mask, %y_mask) %dims : Dynamic = aten::__or__(%5_dims, %y_dims) - %19 : int = prim::Constant[value=1]() + %21 : int = prim::Constant[value=1]() %data : Dynamic = aten::where(%mask, %data.1, %5_data) - -> (%19, %data, %mask, %dims) + -> (%21, %data, %mask, %dims) } - return (%x, %21, %22); + return (%x, %9, %10); } diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect index 66e3cdb6a2dfa8..7aba7a89ace320 100644 --- a/test/expect/TestBatched.test_while.expect +++ b/test/expect/TestBatched.test_while.expect @@ -14,34 +14,34 @@ graph(%a.1_data : Dynamic %13 : Dynamic = aten::sum(%12) %14 : Dynamic = aten::gt(%13, %11) %15 : int = prim::TensorToNum(%14) - %63 : Dynamic, %64 : Dynamic, %65 : Dynamic, %a : Dynamic, %61 : Dynamic, %62 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) + %16 : Dynamic, %17 : Dynamic, %18 : Dynamic, %a : Dynamic, %20 : Dynamic, %21 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { - %24 : int = prim::Constant[value=1]() - %25 : Long() = prim::NumToTensor(%24) - %alpha : float = prim::TensorToNum(%25) + %29 : int = prim::Constant[value=1]() + %30 : Long() = prim::NumToTensor(%29) + %alpha : float = prim::TensorToNum(%30) %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%6_mask, %b_mask) %dims : Dynamic = aten::__or__(%6_dims, %b_dims) - %30 : Dynamic = aten::gt(%data.1, %b_data) - %31 : Dynamic = aten::mul(%mask, %b_mask) - %32 : Dynamic = aten::__or__(%dims, %b_dims) - %33 : int = prim::TensorToNum(%30) - %34 : int = prim::Constant[value=1]() - %35 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) - %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %35) - %37 : int = aten::dim(%cond_mask.1) - %38 : int = aten::eq(%37, %34) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) + %35 : Dynamic = aten::gt(%data.1, %b_data) + %36 : Dynamic = aten::mul(%mask, %b_mask) + %37 : Dynamic = aten::__or__(%dims, %b_dims) + %38 : int = prim::TensorToNum(%35) + %39 : int = prim::Constant[value=1]() + %40 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) + %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %40) + %42 : int = aten::dim(%cond_mask.1) + %43 : int = aten::eq(%42, %39) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%43) block0() { - %42 : int = aten::dim(%data.1) - %43 : int = aten::sub(%42, %34) - %44 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%43, %44, %cond_mask.1) - block0(%_ : int, %47 : Dynamic) { - %48 : int = aten::dim(%47) - %data.2 : Dynamic = aten::unsqueeze(%47, %48) - %50 : int = prim::Constant[value=1]() - -> (%50, %data.2) + %47 : int = aten::dim(%data.1) + %48 : int = aten::sub(%47, %39) + %49 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%48, %49, %cond_mask.1) + block0(%_ : int, %52 : Dynamic) { + %53 : int = aten::dim(%52) + %data.2 : Dynamic = aten::unsqueeze(%52, %53) + %55 : int = prim::Constant[value=1]() + -> (%55, %data.2) } 
%cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) @@ -53,12 +53,12 @@ graph(%a.1_data : Dynamic %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) %res_dims : Dynamic = aten::__or__(%dims, %6_dims) - %56 : int = prim::Constant[value=0]() - %57 : Dynamic = aten::mul(%30, %31) - %58 : Dynamic = aten::sum(%57) - %59 : Dynamic = aten::gt(%58, %56) - %60 : int = prim::TensorToNum(%59) - -> (%60, %30, %31, %32, %res_data, %res_mask, %res_dims) + %61 : int = prim::Constant[value=0]() + %62 : Dynamic = aten::mul(%35, %36) + %63 : Dynamic = aten::sum(%62) + %64 : Dynamic = aten::gt(%63, %61) + %65 : int = prim::TensorToNum(%64) + -> (%65, %35, %36, %37, %res_data, %res_mask, %res_dims) } - return (%a, %61, %62); + return (%a, %20, %21); } diff --git a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect index bed05b89580c6c..b2159144f798fc 100644 --- a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic) { - %2 : int = prim::Constant[value=1]() - %1 : Dynamic = aten::neg(%x) + %1 : int = prim::Constant[value=1]() + %2 : Dynamic = aten::neg(%x) %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%1, %2, %3) + %4 : Dynamic = aten::add(%2, %1, %3) return (%4); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect index b7492626b5d8fe..3478376f829c85 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect @@ -1,14 +1,14 @@ graph(%x : Dynamic) { - %9 : int = prim::Constant[value=1]() - %1 : int = prim::Constant[value=3]() - %2 : int = prim::Constant[value=4]() - %3 : int[] = prim::ListConstruct(%2, %1) - %4 : int = prim::Constant[value=6]() - %5 : int = prim::Constant[value=0]() - %6 : int[] = prim::Constant[value=[0, -1]]() - %7 : Dynamic = aten::zeros(%3, %4, %5, %6) - %8 : Dynamic = aten::mm(%x, %7) + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=3]() + %3 : int = prim::Constant[value=4]() + %4 : int[] = prim::ListConstruct(%3, %2) + %5 : int = prim::Constant[value=6]() + %6 : int = prim::Constant[value=0]() + %7 : int[] = prim::Constant[value=[0, -1]]() + %8 : Dynamic = aten::zeros(%4, %5, %6, %7) + %9 : Dynamic = aten::mm(%x, %8) %10 : int = prim::Constant[value=1]() - %11 : Dynamic = aten::add(%8, %9, %10) + %11 : Dynamic = aten::add(%9, %1, %10) return (%11); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_module.expect b/test/expect/TestScript.test_call_script_mod_from_script_module.expect index 5cae9dcdf96e9d..0365ff600b0a20 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_module.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_module.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic %1 : Dynamic - %3 : Dynamic) { - %2 : Dynamic = aten::mm(%x, %1) - %4 : Dynamic = aten::mm(%2, %3) + %2 : Dynamic) { + %3 : Dynamic = aten::mm(%x, %1) + %4 : Dynamic = aten::mm(%3, %2) return (%4); } diff --git a/test/test_jit.py b/test/test_jit.py index 84313a07dff364..85c6ce991292da 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2230,7 +2230,7 @@ def single_if(a, b): script_if = 
torch.jit.script(single_if)
         graph = torch.to_batch_graph(script_if.graph)
-        self.assertExpected(str(graph))
+        self.assertExpected(canonical(graph))
 
     def test_if_else_with_scalar(self):
         def single_if(a, b):
@@ -2250,7 +2250,7 @@ def single_if(a, b):
 
         script_if = torch.jit.script(single_if)
         graph = torch.to_batch_graph(script_if.graph)
-        self.assertExpected(str(graph))
+        self.assertExpected(canonical(graph))
 
     def test_if_noelse(self):
         def single_if(a, b):
@@ -2268,7 +2268,7 @@ def single_if(a, b):
 
         script_if = torch.jit.script(single_if)
         graph = torch.to_batch_graph(script_if.graph)
-        self.assertExpected(str(graph))
+        self.assertExpected(canonical(graph))
 
     def test_if_noelse_with_scalar(self):
         def single_if(a, b):
@@ -2286,7 +2286,7 @@ def single_if(a, b):
 
         script_if = torch.jit.script(single_if)
         graph = torch.to_batch_graph(script_if.graph)
-        self.assertExpected(str(graph))
+        self.assertExpected(canonical(graph))
 
     def test_while(self):
         def single_while(a, b):
@@ -2305,7 +2305,7 @@ def single_while(a, b):
 
         script_while = torch.jit.script(single_while)
         graph = torch.to_batch_graph(script_while.graph)
-        self.assertExpected(str(graph))
+        self.assertExpected(canonical(graph))
 
     def test_for(self):
         def single_for(x, y):
@@ -2323,7 +2323,7 @@ def single_for(x, y):
 
         script_for = torch.jit.script(single_for)
         graph = torch.to_batch_graph(script_for.graph)
-        self.assertExpected(str(graph))
+        self.assertExpected(canonical(graph))
 
     def test_lstm(self):
         def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c):
@@ -3453,6 +3453,80 @@ def test_over_slice():
             return a[3:10] == [3, 4]
         self.checkScript(test_backward_slice, ())
 
+    def test_mutable_list(self):
+        def test_append():
+            a = [0, 1]
+            a.append(2)
+            a.append(3)
+            return a == [0, 1, 2, 3]
+        self.checkScript(test_append, ())
+
+        def test_append_2():
+            a = [0, 1]
+            a.append(2)
+            a = [1]
+            a.append(4)
+            return a == [1, 4]
+        self.checkScript(test_append_2, ())
+
+        def test_append_if():
+            a = [1]
+            if True:
+                a.append(4)
+            return a == [1, 4]
+        self.checkScript(test_append_if, ())
+
+        def test_append_if_else():
+            a = [1]
+            if False:
+                a.append(4)
+            else:
+                a.append(10)
+            return a == [1, 10]
+        self.checkScript(test_append_if_else, ())
+
+        def test_append_loop():
+            a = _construct_empty_int_list()
+            for i in range(5):
+                a.append(i)
+
+            return a == [0, 1, 2, 3, 4]
+        self.checkScript(test_append_loop, ())
+
+        def test_append_loop_if():
+            a = _construct_empty_int_list()
+            for i in range(5):
+                if i > 3:
+                    a.append(i)
+                else:
+                    a.append(0)
+
+            return a == [0, 0, 0, 0, 4]
+        self.checkScript(test_append_loop_if, ())
+
+        def test_nested_loop():
+            a = _construct_empty_int_list()
+            for i in range(2):
+                for j in range(2):
+                    a.append(i + j)
+
+            return a == [0, 1, 1, 2]
+        self.checkScript(test_nested_loop, ())
+
+    def test_mutable_list_function_inline(self):
+        @torch.jit.script
+        def bar(y):
+            # type: (List[int]) -> List[int]
+            y.append(4)
+
+        @torch.jit.script
+        def foo():
+            x = [1, 2, 3]
+            bar(x)
+            return x
+
+        self.assertEqual(foo(), [1, 2, 3, 4])
+
     def test_func_call(self):
         script = '''
 def add(a, b):
@@ -6496,7 +6570,7 @@ def script_fn(x):
 
         # Note: the neg op from script_fn1 should be properly inlined into the
         # graph of script_fn
-        self.assertExpected(str(script_fn.graph))
+        self.assertExpected(canonical(script_fn.graph))
 
     def test_call_script_mod_from_script_fn(self):
         class ScriptMod(torch.jit.ScriptModule):
@@ -6513,7 +6587,7 @@ def forward(self, x):
         def script_fn(x):
             return sm(x) + 1
 
-        self.assertExpected(str(script_fn.graph))
+
self.assertExpected(canonical(script_fn.graph)) def test_call_python_fn_from_script_module(self): def python_fn(x): @@ -6612,7 +6686,7 @@ def forward(self, x): return script_fn(torch.mm(x, self.param)) sm = ScriptMod() - self.assertExpected(str(sm.__getattr__('forward').graph)) + self.assertExpected(canonical(sm.__getattr__('forward').graph)) def test_call_script_mod_from_script_module(self): class ScriptMod1(torch.jit.ScriptModule): @@ -6638,7 +6712,7 @@ def forward(self, x): # Note: the parameters from both modules should appear in the flattened # input list to the graph. The mm op from ScriptMod1 should be properly # inlined - self.assertExpected(str(sm.graph)) + self.assertExpected(canonical(sm.graph)) def test_module_with_params_called_fails(self): with self.assertRaisesRegex(RuntimeError, "Attempted to inline a Module with parameters. Stateful " diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index ce337e93c85463..e9b275bc7d2cab 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -149,6 +149,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/ir.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/annotate_effects.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f1844d2bac6651..1633ac0de4d6aa 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -42,6 +42,8 @@ Value* insertConstant( n->destroy(); n = g.create(prim::None); n->output()->setType(NoneType::get()); + } else if(val.isWorld()) { + n->output()->setType(WorldType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); } diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index c993aa05eb9a04..d9a9e79bc714d2 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -566,6 +566,8 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("StringType"); } else if (kind == TypeKind::VarType) { type_proto->set_denotation("TypeVar:" + type->expect()->name()); + } else if (kind == TypeKind::WorldType) { + type_proto->set_denotation("WorldType"); } else { throw std::runtime_error("unexpected type kind"); } diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index c7b53abf46c2a2..dcaaf766e18c0e 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -46,7 +46,10 @@ struct FunctionSchema { arguments(std::move(arguments)), returns(std::move(returns)), is_vararg(is_vararg), - is_varret(is_varret) {} + is_varret(is_varret), + is_mutable(isMutable()) { + validate(); + } FunctionSchema( Symbol name, std::vector arguments, @@ -58,7 +61,9 @@ struct FunctionSchema { std::move(std::move(arguments)), std::move(std::move(returns)), is_vararg, - is_varret) {} + is_varret) { + validate(); + } const std::string name; const std::vector arguments; @@ -69,6 +74,8 @@ struct FunctionSchema { // arguments are not checked by schema const bool is_vararg; const bool is_varret; + const bool is_mutable; + at::optional argumentIndexWithName(const std::string& name) const { for(size_t i = 0; i < arguments.size(); ++i) { if(name == arguments[i].name) @@ -76,6 +83,23 @@ struct FunctionSchema { } return at::nullopt; } + + private: + bool isMutable() const { + return std::any_of( + arguments.cbegin(), arguments.cend(), [](const Argument& 
arg) { + return arg.type == WorldType::get(); + }); + } + + void validate() const { + if (is_mutable) { + // Mutable schemas should have a world token as the first argument + // and return. + JIT_ASSERT(arguments.at(0).type == WorldType::get()); + JIT_ASSERT(returns.at(0).type == WorldType::get()); + } + } }; // for debugging, make sure we can describe the call site diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index d071c464721559..20ee429b3696c8 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -7,6 +7,7 @@ #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/tracer.h" +#include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/passes/batch_mm.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index c03e3d80e1e06f..212c28e4c10db1 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -260,6 +260,8 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return NoneType::get(); } else if (kind == "GeneratorType") { return GeneratorType::get(); + } else if (kind == "WorldType") { + return WorldType::get(); } else if (kind == "StringType") { return StringType::get(); } else if (kind.find("TypeVar:") == 0) { diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index e1d76dde56c59d..b4e6b7c1398f1b 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -59,6 +59,11 @@ namespace torch { namespace jit { _(prim, ConstantChunk) \ _(prim, NoneGenerator) \ _(aten, floordiv) \ + _(prim, MemoryFence) \ + _(prim, LoadWorld) \ + _(prim, StoreWorld) \ + _(prim, DummyWorld) \ + _(aten, append) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 0d2e22307527b6..14e7fab54d9549 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -32,7 +32,7 @@ namespace torch { namespace jit { // to what the instructions will look like. // In particular we: // * (TODO) desugar Loop trip counts into c = 0, c += 1 instructions in the loop -// * flatten stages so that each stage starts with a load from the stack +// * flatten stages so that each stage starts with a load to registers // and ends with a store to the stack // *. computes move_flags (see Outputs), and inserts // * Drop nodes are inserted for any node that is unused to create a dummy use @@ -72,8 +72,6 @@ Value* createTripCountConjunctiveCondition( return new_cond; } -} // namespace - // this currently just _removes_ the trip count inputs and checks they are // unused. 
In the future they will be desugared into normal arithmetic to // provide a loop counter @@ -142,9 +140,9 @@ static std::vector> flattenStages(Graph & graph) { auto it = graph.nodes().begin(); for(size_t i = 0; i <= graph.stage(); i++) { stage_input_types.emplace_back(); - auto store = graph.create(prim::Store, 0)->insertBefore(*it); + auto load = graph.create(prim::Load, 0)->insertBefore(*it); while(input_pos < graph.inputs().size() && graph.inputs()[input_pos]->stage() == i) { - auto nv = store->addOutput(); + auto nv = load->addOutput(); auto old_node = graph.inputs()[input_pos]; nv->setType(old_node->type()); stage_input_types[i].push_back(old_node->type()); @@ -153,9 +151,9 @@ static std::vector> flattenStages(Graph & graph) { } while(it != graph.nodes().end() && it->stage() == i) ++it; - auto load = graph.create(prim::Load, 0)->insertBefore(*it); + auto store = graph.create(prim::Store, 0)->insertBefore(*it); while(output_pos < graph.outputs().size() && graph.outputs()[output_pos]->stage() == i) { - load->addInput(graph.outputs()[output_pos]); + store->addInput(graph.outputs()[output_pos]); output_pos++; } } @@ -307,6 +305,7 @@ std::unordered_map> findLastUses(Graph & g) { return FindLastUses(g).move_flags; } +} //namespace // pre-processing that happens once per graph struct PreprocessGraph { @@ -503,10 +502,10 @@ struct CodeImpl { insertInstruction(node); } break; } - // each stage ends with a load instruction + // each stage ends with a store instruction // we record where these instructions occur, and use them to // exit the interpreter - if(node->kind() == prim::Load) { + if(node->kind() == prim::Store) { stage_end.push_back(instructions.size()); } } @@ -694,7 +693,7 @@ struct InterpreterStateImpl { for(int i = inst.outputs.size - 1; i >= 0; i--) { int reg = get(inst.outputs,i); registers[reg] = pop(stack); - // std::cout << "pop reg[" << reg << "];\n" << registers[reg].pImpl << "\n"; + // std::cout << "pop reg[" << reg << "];\n" << registers[reg] << "\n"; } pc = new_pc; } catch(std::exception & e) { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 82b14fa0b6839d..90451494bacbc7 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -248,18 +248,22 @@ void Node::lint() const { } // Node subclass invariants - // - Return uses is zero - // - Param inputs is zero - // - Select inputs is one - // - Python operator cconv is correct - IR_IF(this,Constant) JIT_ASSERT(inputs_.size() == 0); + IR_ELSEIF(LoadWorld) + JIT_ASSERT(inputs_.size() == 0); + JIT_ASSERT(outputs_.size() == 1); + IR_ELSEIF(StoreWorld) + JIT_ASSERT(inputs_.size() == 1); + JIT_ASSERT(outputs_.size() == 0); IR_ELSEIF(Return) + // Return uses is zero JIT_ASSERT(outputs().size() == 0); IR_ELSEIF(Param) + // Param inputs is zero JIT_ASSERT(inputs_.size() == 0); IR_ELSEIFM_CONST(PythonOp) + // Python operator cconv is correct size_t n_scalars = 0, n_tensors = 0; for (auto c : value->cconv) { if (c == 'c') { @@ -381,6 +385,7 @@ void Graph::lint() const { for (auto n : b->nodes()) { JIT_ASSERT(n->kind_ != prim::Param); JIT_ASSERT(n->kind_ != prim::Return); + JIT_ASSERT(n->kind_ != prim::DummyWorld); check_node(n); } @@ -447,6 +452,7 @@ void Block::cloneFrom(Block * src, std::function value_map) { local_map[input] = this->addInput()->copyMetadata(input)->setStage(input->stage()); graph->setStage(std::max(graph->stage(), input->stage())); } + for(auto node : src->nodes()) { auto new_node = this->appendNode(graph->createClone(node, env)); new_node->setStage(node->stage()); @@ -466,8 +472,9 @@ void 
Block::cloneFrom(Block * src, std::function value_map) { std::shared_ptr Graph::copy() { auto new_g = std::make_shared(); - auto env = [](Value *) -> Value* { - AT_ERROR("Graph::copy() encountered a use of a value not in scope. Run lint!"); + auto env = [](Value* v) -> Value* { + AT_ERROR( + "Graph::copy() encountered a use of a value not in scope. Run lint!"); }; new_g->block()->cloneFrom(this->block(), env); return new_g; diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 062d0422c2be07..0bb5c899c7321d 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -236,7 +236,7 @@ struct Value { void replaceFirstUseWith(Value * newValue); - // Replaces all uses of this node with 'newValue'. + // Replaces all uses of this value with 'newValue'. // // Given: %3 = f(%1, %2) // %4 = g(%3) @@ -320,6 +320,9 @@ struct Node : public Attributes { Block * owningBlock() { return owning_block_; } + const Block * owningBlock() const { + return owning_block_; + } size_t stage() const { return stage_; } @@ -442,33 +445,33 @@ struct Node : public Attributes { // Given: %3 = f(%1, %2) // Execute: %3.addInput(%4) // Result: %3 = f(%1, %2, %4) - Value* addInput(Value * node) { - JIT_ASSERT(graph_ == node->owningGraph()); + Value* addInput(Value * value) { + JIT_ASSERT(graph_ == value->owningGraph()); schema_ = nullptr; - node->uses_.emplace_back(this, inputs_.size()); - inputs_.push_back(node); - return node; + value->uses_.emplace_back(this, inputs_.size()); + inputs_.push_back(value); + return value; } - // Add 'node' as an input to 'this' at the specified position in the - // arguments. Returns the added node for ease of chaining. - Value* insertInput(size_t i, Value* node) { - JIT_ASSERT(graph_ == node->owningGraph()); + // Add 'value' as an input to 'this' at the specified position in the + // arguments. Returns the added value for ease of chaining. + Value* insertInput(size_t i, Value* value) { + JIT_ASSERT(graph_ == value->owningGraph()); schema_ = nullptr; // First we update the offsets for all existing inputs that will reside // after the one we're inserting. Concretely, these are the inputs at // indices [i, # input). Since we're inserting one input before all of - // these inputs, increment their use offsets for this Node by 1 + // these inputs, increment their use offsets for this value by 1 for (size_t use_itr = i; use_itr < inputs_.size(); ++use_itr) { // See Note [User node does not uniquely identify use] auto use = findUseForInput(use_itr); use->offset += 1; } // Insert the actual input at the specified index - inputs_.insert(inputs_.begin() + i, node); + inputs_.insert(inputs_.begin() + i, value); // Register the new use of the value we're inserted as an input. - node->uses_.emplace_back(this, i); - return node; + value->uses_.emplace_back(this, i); + return value; } // Replace the input of 'this' at position 'i' with @@ -549,7 +552,7 @@ struct Node : public Attributes { return {blocks_.data(), blocks_.size()}; } - // Insert unattached 'this' node after 'n' in the topological order. + // Insert unattached 'this' node before 'n' in the topological order. // Returns this (for chaining). 
//
   // Given: %3 = f(%1, %2)
@@ -804,8 +807,8 @@ struct Block {
   void eraseInput(size_t i) {
     input_->eraseOutput(i);
   }
-  size_t registerOutput(Value * n) {
-    output_->addInput(n);
+  size_t registerOutput(Value * v) {
+    output_->addInput(v);
     return outputs().size() - 1;
   }
   size_t insertOutput(size_t i, Value* n) {
@@ -1107,6 +1110,12 @@ friend struct Block;
     return jit::insertConstant(*this, std::move(val), loc);
   }
 
+  Value* insertDummyWorld() {
+    auto node = create(prim::DummyWorld, 1);
+    node->output()->setType(WorldType::get());
+    return insertNode(node)->output();
+  }
+
   // schema-driven insert
   // this inserts a node into the graph with inputs determined from args and kwargs using Python
   // argument matching rules, and checks that the op matches a known schema
@@ -1323,11 +1332,11 @@ inline void Node::cloneFrom(Node * s) {
   copyAttributes(*s);
 }
 
-inline Block::Block(Graph * graph_, Node * node_)
-: graph_(graph_)
-, output_(initOutput(graph_->create(prim::Return, 0)))
-, input_(graph_->create(prim::Param,0))
-, owning_node_(node_) {
+inline Block::Block(Graph* graph_, Node* node_)
+    : graph_(graph_),
+      output_(initOutput(graph_->create(prim::Return, 0))),
+      input_(graph_->create(prim::Param, 0)),
+      owning_node_(node_) {
   graph_->all_blocks.emplace(this);
   output_->owning_block_ = this;
   input_->owning_block_ = this;
diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp
index 6a4a699c24f9ad..d701b536c44a0c 100644
--- a/torch/csrc/jit/operator.cpp
+++ b/torch/csrc/jit/operator.cpp
@@ -58,6 +58,7 @@ struct SchemaParser {
       {"float", FloatType::get() },
       {"int", IntType::get() },
       {"bool", IntType::get() }, // TODO: add separate bool type
+      {"World", WorldType::get() },
     };
     auto tok = L.expect(TK_IDENT);
     auto text = tok.text();
diff --git a/torch/csrc/jit/passes/annotate_effects.cpp b/torch/csrc/jit/passes/annotate_effects.cpp
new file mode 100644
index 00000000000000..b8aaec83dddbac
--- /dev/null
+++ b/torch/csrc/jit/passes/annotate_effects.cpp
@@ -0,0 +1,320 @@
+#include "torch/csrc/jit/passes/annotate_effects.h"
+
+#include <set>
+#include "torch/csrc/jit/passes/dead_code_elimination.h"
+
+namespace torch {
+namespace jit {
+namespace {
+
+/**
+ * AnnotateEffects
+ *
+ * This pass annotates effectful operations (such as ones that mutate existing
+ * values) to prevent subsequent passes from re-ordering ops in a way that
+ * changes the meaning of the program.
+ *
+ * It does this by threading a "world token" value through nodes that use
+ * mutable values. This models effects explicitly in the IR and forces all
+ * annotated nodes to be linearized during optimization.
+ *
+ * For mutating operators: the world token is threaded directly through the
+ * node.
+ * For purely functional operators: their node will be "fenced" by two
+ * `prim::MemoryFence` nodes that take world tokens as their input.
+ *
+ * Graphs have special EntryWorld and ExitWorld nodes that provide end-points
+ * for the world token. They are similar to graph inputs/outputs in that they
+ * are not in the node list and are only accessible via special methods.
+ *
+ * When inlined, graphs will manifest the EntryWorld/ExitWorld nodes explicitly
+ * as LoadWorld/StoreWorld ops so that they can act as endpoints where the
+ * callee "world thread" can be joined to the caller world thread.
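+ *
+ * As a rough sketch (the token names below are illustrative, not the exact
+ * output of the pass), a graph that appends to a list ends up looking like:
+ *
+ *   %t.0 : World = prim::LoadWorld()
+ *   %t.1 : World = aten::append(%t.0, %list, %el)
+ *   ... further annotated nodes, each consuming and producing a token ...
+ *    = prim::StoreWorld(%t.N)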
+ */ +class AnnotateEffectsImpl { + public: + void annotateEffects(Graph* g) { + if (!shouldAnnotate(g->block())) { + return; + } + + // Generate the first world token + Value* curToken = nullptr; + { + WithInsertPoint guard(*g->nodes().begin()); + auto loadWorld = g->insertNode(g->create(prim::LoadWorld, 1)); + curToken = loadWorld->output()->setType(WorldType::get()); + } + + auto lastToken = visitBlock(g->block(), curToken); + + auto storeWorld = g->insertNode(g->create(prim::StoreWorld, 0)); + storeWorld->addInput(lastToken); + } + + private: + Value* visitBlock(Block* block, Value* curToken) { + for (auto* node : block->nodes()) { + curToken = visitNode(node, curToken); + } + return curToken; + } + + // General node annotation. If a node uses a mutable variable (or mutates a + // previously constant variable), annotate it + // + // Returns the last world token emitted for subsequent annotations to use. + Value* visitNode(Node* node, Value* curToken) { + // Avoid annotating memory fences. This avoids an infinite loop as we add + // fences and continue to iterate through nodes. + if (node->kind() == prim::MemoryFence) { + // Return this memory fence's world token + return node->outputs().at(0); + } + + // Handle inlined functions. Inlined functions will expose their Entry and + // Exit tokens as regular nodes. These exposed nodes provide fixed points + // to thread the current world token through. + if (node->kind() == prim::LoadWorld) { + auto inlinedEntryToken = node->output(); + inlinedEntryToken->replaceAllUsesWith(curToken); + return curToken; + } + + if (node->kind() == prim::StoreWorld) { + return node->input(); + } + + if (node->kind() == prim::If) { + JIT_ASSERT(node->blocks().size() == 2); + + auto trueBlock = node->blocks().at(0); + auto falseBlock = node->blocks().at(1); + + auto trueToken = visitBlock(trueBlock, curToken); + auto falseToken = visitBlock(falseBlock, curToken); + + // If any branch has a mutating op, this node has to output a world token + if (trueToken != curToken || falseToken != curToken) { + trueBlock->registerOutput(trueToken); + falseBlock->registerOutput(falseToken); + + return node->addOutput()->setType(WorldType::get()); + } + return curToken; + } + + if (node->kind() == prim::Loop) { + JIT_ASSERT(node->blocks().size() == 1); + auto block = node->blocks().at(0); + if (!shouldAnnotate(block)) { + // Bail out early if there's no mutable variables used inside + return curToken; + } + + // Register the world token as a loop carried dependency + auto beginLoopToken = block->addInput()->setType(WorldType::get()); + auto endLoopToken = visitBlock(block, beginLoopToken); + block->registerOutput(endLoopToken); + + JIT_ASSERT(endLoopToken != beginLoopToken); + + // Thread the world token through the loop node + node->addInput(curToken); + return node->addOutput()->setType(WorldType::get()); + } + + // For mutating ops, just thread the world token through the node. + if (isMutatingOp(node)) { + // Replace the "dummy" token generated by the compiler + node->replaceInput(0, curToken); + return node->outputs().at(0); + } + + JIT_ASSERT(node->blocks().size() == 0); + + // For pure ops that need to be annotated, fence them. + if (shouldAnnotate(node)) { + if (isFenced(node)) { + // If the node has already been fenced, just return the value from the + // end fence. This can happen when another graph is inlined. 
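+        // (Adding a second pair of fences around an already-fenced node
+        // would be redundant, so we reuse the token produced by its
+        // existing end fence instead.)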
+ return getTokenForFencedNode(node); + } + return addFenceForNode(node, curToken); + } + + return curToken; + } + + bool shouldAnnotate(const Node* node) { + // Check if this node uses a known mutable value + for (const auto* input : node->inputs()) { + if (!isMutableType(input)) { + // TODO(suo): Right now, we only support mutable lists. + // If we remove this check, it's not clear whether: + // + // append(int[] a, int b) + // + // mutates `a` or `b`. We'll need to extend the schema language to be + // able to express which argument is mutated. + continue; + } + // First check the cache + if (mutableValues_.count(input) != 0) { + return true; + } + + // Check whether any mutating op uses this input + for (const auto& use : input->uses()) { + if (isMutatingOp(use.user)) { + mutableValues_.insert(input); + return true; + } + } + } + + // Check that any sub-blocks need to be annotated + for (auto block : node->blocks()) { + if (shouldAnnotate(block)) { + return true; + } + } + + return false; + } + + bool shouldAnnotate(const Block* block) { + return std::any_of( + block->nodes().begin(), block->nodes().end(), [this](const Node* node) { + return shouldAnnotate(node); + }); + } + + bool isMutableType(const Value* value) { + return value->type()->kind() == TypeKind::ListType; + } + + bool isMutatingOp(const Node* node) { + return !node->inputs().empty() && + node->inputs()[0]->type() == WorldType::get(); + } + + // Returns true iff this node has already been fenced. This can happen if + // another graph was inlined into the current one. + bool isFenced(const Node* node) { + // A node is fenced if all its inputs/outputs are used by memory fences. + const auto inputsFenced = std::all_of( + node->inputs().begin(), node->inputs().end(), [&](const Value* input) { + return std::any_of( + input->uses().cbegin(), + input->uses().cend(), + [&](const Use& use) { + return use.user->kind() == prim::MemoryFence; + }); + }); + if (!inputsFenced) { + return false; + } + + const auto outputsFenced = std::all_of( + node->outputs().begin(), + node->outputs().end(), + [&](const Value* input) { + return std::any_of( + input->uses().cbegin(), + input->uses().cend(), + [&](const Use& use) { + return use.user->kind() == prim::MemoryFence; + }); + }); + if (!outputsFenced) { + return false; + } + + return true; + } + + // Given a fenced node, return the world token outputted from its end fence + Value* getTokenForFencedNode(const Node* node) { + // Take advantage of the fact that the end fence consumes the node's + // outputs, i.e. it will be the only user. + const auto output = node->outputs().at(0); + JIT_ASSERT(output->uses().size() == 1); + const auto endFence = output->uses()[0].user; + const auto token = endFence->outputs().at(0); + JIT_ASSERT(token->type() == WorldType::get()); + return token; + } + + // Create a memory fence around a node, using the world token. + // + // Input: + // %size : Int = prim::len(%mut_list) + // + // Output: + // %t.1 : World, %list.2 : int[] = prim::MemoryFence(%curToken, %mut_list) + // %size : Int = prim::len(%mut_list) + // %t.2 : World, %size.2 : int = prim::MemoryFence(%t.1, %size) + // + // Returns the new world token (%t.2) for subsequent fences to use. 
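+  //
+  // Note that a fence only enforces ordering: it produces one output per
+  // input and passes every value through unchanged, which is why the
+  // prim::MemoryFence operator registered in register_prim_ops.cpp can be
+  // implemented as a no-op on the interpreter stack.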
+ Value* addFenceForNode(Node* node, Value* curToken) { + // Add a start fence + auto startFence = + node->owningGraph()->create(prim::MemoryFence, /*outputs=*/0); + + // Add world tokens as the first input and output + startFence->addInput(curToken); + curToken = startFence->addOutput()->setType(WorldType::get()); + + // Fence off all node's inputs + for (const auto input : node->inputs()) { + startFence->addInput(input); + startFence->addOutput()->setType(input->type()); + } + + startFence->insertBefore(node); + + JIT_ASSERT(node->inputs().size() == startFence->outputs().size() - 1); + + // modify the node to take in the start fence's output values + for (size_t i = 0; i < node->inputs().size(); i++) { + node->replaceInput(i, startFence->outputs()[i + 1]); + } + + // Add an end fence + auto endFence = + node->owningGraph()->create(prim::MemoryFence, /*outputs=*/0); + + // Add world tokens as the first input and output + endFence->addInput(curToken); + curToken = endFence->addOutput()->setType(WorldType::get()); + + // Fence off all the node's outputs + for (auto output : node->outputs()) { + endFence->addInput(output); + auto fencedOutput = endFence->addOutput()->setType(output->type()); + output->replaceAllUsesWith(fencedOutput); + // replaceAllUsesWith() replaces the fence's INPUT value with the new + // output as well, so we need to manually add the "real" input back + endFence->replaceInputWith(fencedOutput, output); + } + + endFence->insertAfter(node); + + return curToken; + } + + // Memoize which values will be mutated at some point in the program + std::set mutableValues_; +}; +} // namespace + +void AnnotateEffects(std::shared_ptr& graph) { + AnnotateEffectsImpl impl; + impl.annotateEffects(graph.get()); + + // Prune the dummy world tokens + EliminateDeadCode(graph); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/annotate_effects.h b/torch/csrc/jit/passes/annotate_effects.h new file mode 100644 index 00000000000000..9c8e969d54ba41 --- /dev/null +++ b/torch/csrc/jit/passes/annotate_effects.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { +namespace jit { + +TORCH_API void AnnotateEffects(std::shared_ptr& graph); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index b9d36d0e4b88e3..179f3751526c4e 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -17,6 +17,8 @@ std::unordered_set skip_list = { prim::Loop, //TODO: handle Loop prim::Print, prim::PythonOp, //may have side effects + prim::LoadWorld, + prim::StoreWorld, //all the rand functions from native_functions.yaml aten::rand, aten::rand_like, diff --git a/torch/csrc/jit/passes/dead_code_elimination.cpp b/torch/csrc/jit/passes/dead_code_elimination.cpp index d8341cbb99c6aa..6424eb70a6cafc 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.cpp +++ b/torch/csrc/jit/passes/dead_code_elimination.cpp @@ -13,12 +13,13 @@ bool hasSideEffects(Node * node, bool_memo_type& memo) { auto it = memo.find(node); if (it != memo.end()) return it->second; - bool has_side_effects = node->kind() == prim::Print || - std::any_of(node->blocks().begin(), node->blocks().end(), - [&](Block *b) { - return std::any_of(b->nodes().begin(), b->nodes().end(), - [&](Node *n) { return hasSideEffects(n, memo); }); - }); + bool has_side_effects = + node->kind() == prim::Print || node->kind() == prim::StoreWorld || + 
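+      // prim::StoreWorld produces no outputs, so it has to be treated as a
+      // side effect here; otherwise dead code elimination would strip it,
+      // and with it the world-token chain that feeds it.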
std::any_of(node->blocks().begin(), node->blocks().end(), [&](Block* b) { + return std::any_of(b->nodes().begin(), b->nodes().end(), [&](Node* n) { + return hasSideEffects(n, memo); + }); + }); memo.emplace(node, has_side_effects); return has_side_effects; } diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index f78da9b92baccc..0d56ca2255286f 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -525,11 +525,10 @@ void ToBatch::toBatch(Block* block, Block* res_block) { } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ - // std::cout<toString()< res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); - // std::cout<toString()<::create(std::move(elems)); + return List::create(std::move(elems)); } inline IValue toIValue(py::handle obj, const TypePtr& type) { @@ -140,6 +140,8 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { return createGenericList(obj, elem_type); } } + case TypeKind::WorldType: + AT_ERROR("World arguments should not be passed in by users"); case TypeKind::NumberType: case TypeKind::GeneratorType: case TypeKind::VarType: diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 0db6f9a394459c..ad03ac556cd272 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -457,6 +457,8 @@ void initPythonIRBindings(PyObject * module_) { return "GeneratorType"; case TypeKind::VarType: return "VarType"; + case TypeKind::WorldType: + return "WorldType"; } // not reachable, but some compilers complain AT_ERROR("Unknown Type Kind"); diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 574d9ca1446396..be57a03c9ff48e 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -52,6 +52,13 @@ void checkImplicitTensorToNum(at::Tensor t, bool toInt) { } RegisterOperators reg({ + Operator( + prim::MemoryFence, + [](Node* node) { + return [](Stack& stack) { + return 0; + }; + }), Operator( prim::FusionGroup, [](Node* node) { @@ -204,6 +211,30 @@ RegisterOperators reg({ return 0; }; }), + Operator( + prim::LoadWorld, + [](Node* node) { + return [](Stack& stack) { + push(stack, World{0}); + return 0; + }; + }), + Operator( + prim::StoreWorld, + [](Node* node) { + return [](Stack& stack) { + drop(stack, 1); + return 0; + }; + }), + Operator( + prim::DummyWorld, + [](Node* node) { + return [](Stack& stack) { + AT_ERROR("Encountered a dummy world during graph execution."); + return 0; + }; + }), Operator( onnx::Reshape, [](Node* node) { @@ -479,6 +510,19 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size) { return idx; } +template +Operation listAppend(Node* node) { + return [](Stack& stack) { + TList a; + TElement el; + pop(stack, a, el); + + a->elements().push_back(el); + + return 0; + }; +} + template Operation listSelect(Node* node) { return [=](Stack& stack) { @@ -615,7 +659,10 @@ RegisterOperators reg2({ Operator("aten::add(" decl_type "[] a, " decl_type "[] b) -> " decl_type "[]", listAdd, c_type::ElemType>), \ Operator( \ "aten::slice(" decl_type "[] l, int start, int end=9223372036854775807, int step=1) -> " decl_type "[]", \ - listSlice, c_type::ElemType>), + listSlice, c_type::ElemType>), \ + Operator( \ + "aten::append(World w, " decl_type "[] list, " decl_type " el) -> World", \ + listAppend, c_type::ElemType>), \ CREATE_LIST_OPS("int", IntList) @@ -623,11 +670,11 @@ RegisterOperators reg2({ 
CREATE_LIST_OPS("Tensor", TensorList) CREATE_LIST_OPS("t", GenericList) + Operator("aten::eq(int[] a, int[] b) -> int", listEq>), Operator("aten::eq(float[] a, float[] b) -> int", listEq>), Operator("aten::eq(Tensor[] a, Tensor[] b) -> int", listEq>), - DEFINE_BINARY_OP(aten::add, a + b) DEFINE_BINARY_OP(aten::sub, a - b) DEFINE_BINARY_OP(aten::mul, a * b) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 384cc167735e55..3b5c523ce439c2 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1,5 +1,6 @@ #include "torch/csrc/jit/script/compiler.h" #include "torch/csrc/jit/passes/lower_tuples.h" +#include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" @@ -549,122 +550,128 @@ static Value* materializeConstant(T val, Graph& graph, } at::optional tryMatchSchema( - const FunctionSchema& schema, - const SourceRange& loc, - Graph& graph, - at::ArrayRef args, - at::ArrayRef kwargs, - std::ostream& failure_messages, - bool convert_tensors_to_nums) { - auto err = [&]() -> std::ostream& { - failure_messages << "\nfor operator " << schema << ":\n"; - return failure_messages; - }; + const FunctionSchema& schema, + const SourceRange& loc, + Graph& graph, + at::ArrayRef raw_args, + at::ArrayRef kwargs, + std::ostream& failure_messages, + bool convert_tensors_to_nums) { + // Match against a potentially mutable schema. + // + // We need to treat mutable schemas differently because the IR explicitly + // expresses effects by including a world token in mutable ops. Users do not + // know about the world token, so we need to generate a dummy one and add + // it to the inputs for schema matching. + // + // Example: + // append(int[] list, int el) + // becomes + // append(World w, int[] list, int el) + // + // NOTE: The dummy world token has no meaning; the AnnotateEffects pass is + // necessary to enforce linearization on effectful ops. + std::vector modifiedArgs(raw_args.begin(), raw_args.end()); + if (schema.is_mutable) { + // Add a dummy world token to be matched against + const auto worldToken = graph.insertDummyWorld(); + modifiedArgs.insert(modifiedArgs.begin(), worldToken); + } + auto err = [&]() -> std::ostream& { + failure_messages << "\nfor operator " << schema << ":\n"; + return failure_messages; + }; - TypeEnv type_env; - std::vector positional_inputs; - std::vector used_kwarg(kwargs.size(), false); - - // if we finish the loop will we have consumed all arguments? 
- size_t used_args = 0; - - for(size_t schema_i = 0; schema_i < schema.arguments.size(); ++schema_i) { - const auto& arg = schema.arguments[schema_i]; - at::optional v; - if(!arg.kwarg_only && schema_i < args.size()) { - - // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) - if (arg.type->kind() == TypeKind::ListType && // the formal must be a list - !arg.N && // it must not be a broadcasting list like int[3], otherwise a single int is a valid input - (schema_i + 1 == schema.arguments.size() || schema.arguments[schema_i + 1].kwarg_only)) { // must be the last position argument - auto actual_type = args[schema_i].value(graph)->type(); - if (actual_type->kind() != TypeKind::ListType && !convertibleToList(actual_type, arg.type)) { // and the actual should not be a list already - auto elem_type = arg.type->expect()->getElementType(); - Value* list = tryCreateList(elem_type, graph, loc, args.slice(schema_i), - err, convert_tensors_to_nums, type_env); - if(!list) - return at::nullopt; - used_args = args.size(); - positional_inputs.push_back(list); - continue; - } + TypeEnv type_env; + std::vector positional_inputs; + std::vector used_kwarg(kwargs.size(), false); + + // if we finish the loop will we have consumed all arguments? + size_t used_args = 0; + + for (size_t schema_i = 0; schema_i < schema.arguments.size(); ++schema_i) { + const auto& arg = schema.arguments[schema_i]; + at::optional v; + if (!arg.kwarg_only && schema_i < modifiedArgs.size()) { + // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) + if (arg.type->kind() == TypeKind::ListType && // the formal must be a list + !arg.N && // it must not be a broadcasting list like int[3], otherwise + // a single int is a valid input + (schema_i + 1 == schema.arguments.size() || + schema.arguments[schema_i + 1] + .kwarg_only)) { // must be the last position argument + auto actual_type = modifiedArgs[schema_i].value(graph)->type(); + if (actual_type->kind() != TypeKind::ListType && + !convertibleToList( + actual_type, + arg.type)) { // and the actual should not be a list already + auto elem_type = arg.type->expect()->getElementType(); + Value* list = tryCreateList( + elem_type, + graph, + loc, + at::ArrayRef(modifiedArgs).slice(schema_i), + err, + convert_tensors_to_nums, + type_env); + if (!list) + return at::nullopt; + used_args = modifiedArgs.size(); + positional_inputs.push_back(list); + continue; } + } - v = args[schema_i]; - used_args++; - } else if(auto idx = findInputWithName(arg.name, kwargs)) { - const NamedValue& nv = kwargs[*idx]; - if(used_kwarg[*idx]) { - err() << "argument " << nv.name() << " specified twice in schema, submit a bug report!\n" << nv.locOr(loc); - return at::nullopt; - } - used_kwarg[*idx] = true; - v = nv; - } else if(arg.default_value) { - v = NamedValue(*arg.default_value); - } else { - err() << "argument " << schema.arguments[schema_i].name << " not provided.\n" << loc; + v = modifiedArgs[schema_i]; + used_args++; + } else if (auto idx = findInputWithName(arg.name, kwargs)) { + const NamedValue& nv = kwargs[*idx]; + if (used_kwarg[*idx]) { + err() << "argument " << nv.name() + << " specified twice in schema, submit a bug report!\n" + << nv.locOr(loc); return at::nullopt; } - Value * positional = tryMatchArgument(arg, graph, loc, *v, err, convert_tensors_to_nums, type_env); - if(!positional) - return at::nullopt; - positional_inputs.push_back(positional); - } - - // check for unused positional arguments - if(used_args < args.size()) { - err() << "expected at most " << used_args << 
" arguments " - << "but found " << args.size() << " positional arguments.\n" << loc << "\n"; + used_kwarg[*idx] = true; + v = nv; + } else if (arg.default_value) { + v = NamedValue(*arg.default_value); + } else { + err() << "argument " << schema.arguments[schema_i].name + << " not provided.\n" + << loc; return at::nullopt; } - // check for unused kwargs - for(size_t i = 0; i < kwargs.size(); ++i) { - const auto& nv = kwargs[i]; - if (!used_kwarg[i]) { - if(!schema.argumentIndexWithName(nv.name())) { - err() << "keyword argument " << nv.name() << " unknown\n"; - } else { - err() << "keyword argument " << nv.name() << " specified twice\n"; - } - return at::nullopt; + Value* positional = tryMatchArgument( + arg, graph, loc, *v, err, convert_tensors_to_nums, type_env); + if (!positional) + return at::nullopt; + positional_inputs.push_back(positional); + } + + // check for unused positional arguments + if (used_args < modifiedArgs.size()) { + err() << "expected at most " << used_args << " arguments " + << "but found " << modifiedArgs.size() << " positional arguments.\n" + << loc << "\n"; + return at::nullopt; + } + // check for unused kwargs + for (size_t i = 0; i < kwargs.size(); ++i) { + const auto& nv = kwargs[i]; + if (!used_kwarg[i]) { + if (!schema.argumentIndexWithName(nv.name())) { + err() << "keyword argument " << nv.name() << " unknown\n"; + } else { + err() << "keyword argument " << nv.name() << " specified twice\n"; } + return at::nullopt; } - auto return_types = fmap(schema.returns, [&](const Argument& r) { - return evalTypeVariables(r.type, type_env); - }); - return MatchedSchema {std::move(positional_inputs), std::move(return_types) }; -} - - -static Value* tryEmitBuiltin( - const std::shared_ptr& op, - std::stringstream& failure_messages, - const SourceRange& loc, - Graph& graph, - Symbol name, - at::ArrayRef inputs, - at::ArrayRef attributes, - bool convert_tensors_to_nums) { - - auto matched_schema = tryMatchSchema(op->schema(), loc, graph, inputs, attributes, - failure_messages, convert_tensors_to_nums); - if(!matched_schema) - return nullptr; - // we successfully matched this schema, construct the node - - auto n = graph.insertNode(graph.create(name, matched_schema->inputs, 0)) - ->setSourceLocation(std::make_shared(loc)); - - for(auto & ret : matched_schema->return_types) { - n->addOutput()->setType(ret); } - - // assert that we did indeed create an op that has implementation - // otherwise schema and dispatch are not in sync - getOperation(n); - - return packOutputs(graph, n->outputs()); + auto return_types = fmap(schema.returns, [&](const Argument& r) { + return evalTypeVariables(r.type, type_env); + }); + return MatchedSchema{std::move(positional_inputs), std::move(return_types)}; } static std::string prefixLine(const std::string& str, std::string prefix) { @@ -679,6 +686,29 @@ static std::string prefixLine(const std::string& str, std::string prefix) { return ss.str(); } +// Given a successful match between operator schema and symbol, emit a node +// with the appropriate inputs and outputs. 
+static Value* emitBuiltinNode( + const MatchedSchema& matched_schema, + const SourceRange& loc, + Graph& graph, + Symbol name) { + auto n = graph.insertNode(graph.create(name, matched_schema.inputs, 0)) + ->setSourceLocation(std::make_shared(loc)); + + for(auto & ret : matched_schema.return_types) { + n->addOutput()->setType(ret); + } + + // assert that we did indeed create an op that has implementation + // otherwise schema and dispatch are not in sync + getOperation(n); + + return packOutputs(graph, n->outputs()); +} + +// Search for operators matching the provided symbol name and input types. +// If one is found, emit a node to the graph for that operator. Value* emitBuiltinCall( const SourceRange& loc, Graph& graph, @@ -694,20 +724,27 @@ Value* emitBuiltinCall( std::stringstream failure_messages; //first we try to match the schema without any conversion //if no schema matches then insert ImplicitTensorToNum - for(bool convert_tensors_to_nums : {false, true}) { - //clear previous error messages + for (bool convert_tensors_to_nums : {false, true}) { + // clear previous error messages failure_messages.str(""); for (const std::shared_ptr& op : variants) { - if (auto result = tryEmitBuiltin( - op, failure_messages, loc, graph, name, inputs, attributes, - convert_tensors_to_nums)) { - return result; + const auto matched_schema = tryMatchSchema( + op->schema(), + loc, + graph, + inputs, + attributes, + failure_messages, + convert_tensors_to_nums); + + if (matched_schema) { + return emitBuiltinNode(*matched_schema, loc, graph, name); } } } // none of the options worked - if(!required) { + if (!required) { return nullptr; } if(variants.size() == 0) { @@ -736,8 +773,8 @@ std::shared_ptr BuiltinFunction::call( if (value) inputs.push_back(*value); inputs.insert(inputs.end(), inputs_.begin(), inputs_.end()); - return std::make_shared( - emitBuiltinCall(loc, *m.graph(), symbol, inputs, attributes, true)); + return std::make_shared(emitBuiltinCall( + loc, *m.graph(), symbol, inputs, attributes, true)); } inline bool isSupportedListElementType(TypePtr type) { @@ -844,6 +881,8 @@ struct to_ir { } method.setSchema({def.name().name(), std::move(arguments), std::move(returns)}); + // annotate effects to prevent reordering + AnnotateEffects(graph); // remove any uses of tuples that we inserted that are not needed LowerSimpleTuples(graph); } diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index bc559d8868daae..855adad429191f 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -57,6 +57,8 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "Generator"; } else if(t.kind() == TypeKind::VarType) { out << t.expect()->name(); + } else if(t.kind() == TypeKind::WorldType) { + out << "World"; } else { AT_ERROR("unknown type kind"); } @@ -91,6 +93,10 @@ GeneratorTypePtr GeneratorType::get() { static auto value = GeneratorType::create(); return value; } +WorldTypePtr WorldType::get() { + static auto value = WorldType::create(); + return value; +} StringTypePtr StringType::get() { static auto value = StringType::create(); return value; diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index b519c6f710b21f..49748de239e2b2 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -27,7 +27,8 @@ _(IntType) \ _(NoneType) \ _(StringType) \ _(GeneratorType) \ -_(VarType) +_(VarType) \ +_(WorldType) \ enum class TypeKind { #define DEFINE_TYPE(T) T, @@ -370,6 +371,32 @@ struct TORCH_API CompleteTensorType : public TensorType { std::vector strides_; }; +// 
This type is a token used to represent effectful computation in the IR. +// See the AnnotateEffects pass for how it is used. +struct WorldType; +using WorldTypePtr = std::shared_ptr; +struct TORCH_API WorldType : public Type { + template + static WorldTypePtr create(T&&... all) { + return WorldTypePtr(new WorldType(std::forward(all)...)); + } + bool operator==(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return "world"; + } + bool isSubtypeOf(const TypePtr rhs) const override { + return *this == *rhs; + } + static const TypeKind Kind = TypeKind::WorldType; + // global singleton + static WorldTypePtr get(); + + private: + WorldType() : Type(TypeKind::WorldType) {} +}; + struct ListType; using ListTypePtr = std::shared_ptr; From 149403f8492f532b7fc5c373206d15d3d0851321 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 27 Sep 2018 19:40:24 -0700 Subject: [PATCH 34/82] Move TensorImpl ndim, size, itemsize and nbytes to caffe2::Tensor Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12098 Reviewed By: jerryzh168 Differential Revision: D10051298 fbshipit-source-id: a833fad74bbda38c019ec2cb97d4bb6804e09963 --- aten/src/ATen/core/TensorImpl.h | 55 +++++++++------------------------ caffe2/core/tensor.h | 22 ++++++++++--- 2 files changed, 33 insertions(+), 44 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 24b5789ee5ca0f..7552bfa16d8fb1 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -268,6 +268,9 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const caffe2::TypeMeta& dtype() const { return data_type_; } + size_t itemsize() const { + return data_type_.itemsize(); + } virtual int64_t storage_offset() const { return storage_offset_; @@ -410,7 +413,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { storage_ = at::Storage(device_type(), src.meta()); data_type_ = src.meta(); } - if (src.size() == -1) { + if (src.numel() == -1) { sizes_.clear(); numel_ = -1; strides_.clear(); @@ -420,7 +423,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return; } Resize(src.dims()); - if (size() > 0) { + if (numel() > 0) { if (data_type_.copy()) { CAFFE_ENFORCE( device_type() == ::at::DeviceType::CPU, @@ -428,7 +431,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE( src.device_type() == ::at::DeviceType::CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); - data_type_.copy()(src.data(), raw_mutable_data(data_type_), size()); + data_type_.copy()(src.data(), raw_mutable_data(data_type_), numel()); } else { // We'll need to use a non-CPU context to perform the copy if // one of the context is not CPU since only non-CPU context @@ -436,20 +439,20 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) { if (!context) { src.CreateContext()->CopyBytesToDevice( - nbytes(), src.data(), raw_mutable_data(data_type_), device_type()); + numel() * itemsize(), src.data(), raw_mutable_data(data_type_), device_type()); } else { CAFFE_ENFORCE( context->device_type() == src.device_type(), "Type for provided context does not match the type of source"); context->CopyBytesToDevice( - nbytes(), src.data(), raw_mutable_data(data_type_), device_type()); + numel() * itemsize(), src.data(), raw_mutable_data(data_type_), device_type()); } } 
else { // In case source context is CPU, and target context is non-CPU // We'll have to create a Context from target and perform the // copy using that context CreateContext()->CopyBytesFromCPU( - nbytes(), src.data(), raw_mutable_data(data_type_)); + numel() * itemsize(), src.data(), raw_mutable_data(data_type_)); } } } @@ -782,34 +785,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); } - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return sizes_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline int64_t size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - // NB: This capacity may also include available space // in the storage BEFORE the tensor data, if storage_offset != 0 inline size_t capacity_nbytes() const { @@ -839,14 +814,14 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * allowing for negative indexing (e.g., -1 for the last axis). * * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, + * If 0 <= index < dim(), return index. + * If -ndim <= index <= -1, return (dim() - (-index)), + * e.g., the last axis index (dim() - 1) if index == -1, * the second to last if index == -2, etc. * Dies on out of range index. */ inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); + return canonical_axis_index_(axis_index, dim()); } /** @@ -978,8 +953,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { inline void update_to_contiguous_strides() { strides_.resize(sizes_.size()); - if (ndim() > 0) { - int last_idx = ndim() - 1; + if (dim() > 0) { + int last_idx = dim() - 1; strides_[last_idx] = 1; for (auto i = last_idx - 1; i >= 0; --i) { strides_[i] = strides_[i + 1] * std::max(sizes_[i + 1], 1); diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 8e670858e89284..8795ba379eeb68 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -312,20 +312,34 @@ class CAFFE2_API Tensor final { return impl_.get()->mutable_data(); } + /** + * Returns the number of dimensions of the data. + */ inline int ndim() const { - return impl_.get()->ndim(); + return impl_->dim(); } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ inline int64_t size() const { - return impl_.get()->size(); + return impl_->numel(); } + /** + * Return the number of bytes each item takes in the tensor. + */ inline size_t itemsize() const { - return impl_.get()->itemsize(); + return impl_->storage().itemsize(); } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). 
+ */ inline size_t nbytes() const { - return impl_.get()->nbytes(); + return impl_->numel() * itemsize(); } inline const vector& dims() const { From 3eb5940cf54536c88cff117d0947aeb16f05d371 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Thu, 27 Sep 2018 20:14:15 -0700 Subject: [PATCH 35/82] codemod cuda_gpu_id to device_id (#12022) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12022 codemod -d . --extensions h,cc,cpp,cu,py,proto,pbtxt,pb.txt,config cuda_gpu_id device_id codemod with 'Yes to all' Reviewed By: orionr Differential Revision: D9986213 fbshipit-source-id: f5614a5d26078817aee8caf79a494abfd6a95ff1 --- caffe2/contrib/nccl/cuda_nccl_op_gpu.cc | 4 +- caffe2/contrib/nccl/nccl_ops_test.py | 2 +- caffe2/contrib/prof/prof_dag_net.cc | 4 +- .../tensorboard/tensorboard_exporter.py | 2 +- caffe2/contrib/warpctc/ctc_ops_test.py | 8 +- caffe2/core/blob_gpu_test.cc | 4 +- caffe2/core/context_gpu.cu | 2 +- caffe2/core/context_gpu.h | 6 +- caffe2/core/cudnn_wrappers.h | 6 +- caffe2/core/event_gpu.cc | 16 ++-- caffe2/core/hip/event_hip.cc | 2 +- caffe2/core/memonger.cc | 4 +- caffe2/core/net_async_base.cc | 4 +- caffe2/core/net_async_dag_gpu.cc | 2 +- caffe2/core/net_gpu_test.cc | 2 +- caffe2/core/operator.cc | 2 +- caffe2/mkl/utils/mkl_memory.cc | 2 +- caffe2/observers/profile_observer_gpu.cc | 4 +- caffe2/onnx/backend.cc | 2 +- caffe2/operators/load_save_op_gpu.cc | 2 +- .../rnn/recurrent_network_executor_gpu.cc | 4 +- caffe2/proto/caffe2.proto | 2 +- caffe2/python/cnn.py | 2 +- caffe2/python/core.py | 16 ++-- caffe2/python/core_test.py | 82 +++++++++---------- caffe2/python/data_parallel_model.py | 6 +- caffe2/python/hypothesis_test_util.py | 2 +- caffe2/python/model_helper.py | 4 +- caffe2/python/muji.py | 2 +- caffe2/python/net_printer.py | 4 +- caffe2/python/numa_test.py | 2 +- caffe2/python/onnx/backend_rep.py | 2 +- caffe2/python/operator_test/load_save_test.py | 2 +- caffe2/python/operator_test/rnn_cell_test.py | 2 +- caffe2/python/optimizer.py | 10 +-- .../predictor/predictor_exporter_test.py | 2 +- caffe2/python/pybind_state_dlpack.h | 4 +- caffe2/utils/proto_utils.cc | 4 +- caffe2/utils/proto_utils_test.cc | 4 +- .../pyHIPIFY/cuda_to_hip_mappings.py | 2 +- 40 files changed, 119 insertions(+), 119 deletions(-) diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc index ea8b3494c6a036..4c5313ff4b3032 100644 --- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc @@ -11,7 +11,7 @@ nccl::NCCLExecution getNCCLElements( // We either do an N-N op, or an N-1 op. 
CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1); nccl::NCCLExecution ex; - ex.stream_gpu_id = context.cuda_gpu_id(); + ex.stream_gpu_id = context.device_id(); ex.stream = context.cuda_stream(); ex.root = op->template GetSingleArgument("root", 0); ex.elements.resize(op->InputSize()); @@ -204,7 +204,7 @@ std::pair, std::vector> ncclOpDevInfer( for (int i = 0; i < def.input().size(); ++i) { DeviceOption dev; dev.set_device_type(1); - dev.set_cuda_gpu_id(i); + dev.set_device_id(i); opt.push_back(dev); } return std::make_pair(opt, opt); diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index 7e8a61e9de241d..f6c22a7d750127 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -21,7 +21,7 @@ def gpu_device(i): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = i + device_option.device_id = i return device_option diff --git a/caffe2/contrib/prof/prof_dag_net.cc b/caffe2/contrib/prof/prof_dag_net.cc index 16917ddc154fc9..c8678652c3138f 100644 --- a/caffe2/contrib/prof/prof_dag_net.cc +++ b/caffe2/contrib/prof/prof_dag_net.cc @@ -33,9 +33,9 @@ void ProfDAGNet::ValidateOpTensorDevices() { had_mismatches = true; LOG(INFO) << "== PERFORMANCE WARNING == \n" << " Operator " << node.operator_->debug_def().type() - << " expects GPU " << mismatch.second.first.cuda_gpu_id() + << " expects GPU " << mismatch.second.first.device_id() << " but tensor [" << mismatch.first << "] is on GPU " - << mismatch.second.second.cuda_gpu_id(); + << mismatch.second.second.device_id(); } } if (!had_mismatches) { diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index 93ade48e7d267d..cc2c3d85c96877 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -177,7 +177,7 @@ def _tf_device(device_option): if device_option.device_type == caffe2_pb2.CPU: return "/cpu:*" if device_option.device_type == caffe2_pb2.CUDA: - return "/gpu:{}".format(device_option.cuda_gpu_id) + return "/gpu:{}".format(device_option.device_id) raise Exception("Unhandled device", device_option) diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 25bb0a39e3a965..3b21c8b667473c 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -79,11 +79,11 @@ def test_ctc_cost_cpu(self): def test_ctc_cost_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=False) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=False, skip_input_lengths=True) @@ -99,10 +99,10 @@ def test_ctc_forward_only_cpu(self): def test_ctc_forward_only_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=True) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=True, skip_input_lengths=True) diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 55eafdede7269a..8b4127e403a452 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -195,7 +195,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { } EXPECT_TRUE(tensor_proto.has_device_detail()); 
EXPECT_EQ(tensor_proto.device_detail().device_type(), PROTO_CUDA); - EXPECT_EQ(tensor_proto.device_detail().cuda_gpu_id(), gpu_id); + EXPECT_EQ(tensor_proto.device_detail().device_id(), gpu_id); // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); @@ -205,7 +205,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if we force the restored blob on a different device, we // can still get so. blob.Reset(); - proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); + proto.mutable_tensor()->mutable_device_detail()->set_device_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 1eaa579ee0cdbe..5ffc8c699154f2 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -251,7 +251,7 @@ CUDAContext::CUDAContext(const int gpu_id) CUDAContext::CUDAContext(const DeviceOption& option) : gpu_id_( - option.has_cuda_gpu_id() ? RectifyGPUID(option.cuda_gpu_id()) + option.has_device_id() ? RectifyGPUID(option.device_id()) : CaffeCudaGetDevice()), random_seed_( option.has_random_seed() ? option.random_seed() diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 5fcdb98b100794..afb2e93fdd7fa8 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -182,7 +182,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } } - inline int cuda_gpu_id() const { + inline int device_id() const { return gpu_id_; } @@ -281,7 +281,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } static bool IsStreamFree(const DeviceOption& option, int stream_id) { - auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id); + auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id); return cudaStreamQuery(stream) == cudaSuccess; } @@ -404,7 +404,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* data) override { device->set_device_type(TypeToProto(GetDeviceType())); - device->set_cuda_gpu_id(GetGPUIDForPointer(data)); + device->set_device_id(GetGPUIDForPointer(data)); } protected: diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index 1bd39fa62a399f..dea138e9ad507c 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -122,9 +122,9 @@ class CuDNNWrapper { void with_cudnn_state(size_t state_idx, F&& f) { CAFFE_ENFORCE( state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx"); - auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx]; + auto& sync_state = cudnn_states()[context_->device_id()][state_idx]; - DeviceGuard dg(context_->cuda_gpu_id()); + DeviceGuard dg(context_->device_id()); // We need to serialize execution on the CuDNNState as we can't // allow multiple threads to race through the cudaEventRecord @@ -132,7 +132,7 @@ class CuDNNWrapper { // execution) std::lock_guard g(sync_state.mutex); if (!sync_state.state.get()) { - sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id())); + sync_state.state.reset(new CuDNNState(context_->device_id())); } CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f); } diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc index 6253ca19c9ab70..44aec8d3f2b8f4 100644 --- a/caffe2/core/event_gpu.cc +++ 
b/caffe2/core/event_gpu.cc @@ -9,21 +9,21 @@ namespace caffe2 { struct CudaEventWrapper { explicit CudaEventWrapper(const DeviceOption& option) : cuda_stream_(nullptr), - cuda_gpu_id_(option.cuda_gpu_id()), + device_id_(option.device_id()), status_(EventStatus::EVENT_INITIALIZED) { CAFFE_ENFORCE(option.device_type(), PROTO_CUDA); - DeviceGuard g(cuda_gpu_id_); + DeviceGuard g(device_id_); CUDA_ENFORCE(cudaEventCreate( &cuda_event_, cudaEventDefault | cudaEventDisableTiming)); } ~CudaEventWrapper() { - DeviceGuard g(cuda_gpu_id_); + DeviceGuard g(device_id_); CUDA_CHECK(cudaEventDestroy(cuda_event_)); } cudaEvent_t cuda_event_; cudaStream_t cuda_stream_; - int cuda_gpu_id_; + int device_id_; std::atomic status_; std::mutex mutex_recorded_; @@ -65,12 +65,12 @@ void EventRecordCUDA(Event* event, const void* context, const char* err_msg) { const auto& current_device = CaffeCudaGetDevice(); CAFFE_ENFORCE_EQ( current_device, - wrapper->cuda_gpu_id_, + wrapper->device_id_, "When you call EventRecordCUDA, your current device should be the same " "as the device specified by the event."); CAFFE_ENFORCE_EQ( current_device, - static_cast(context)->cuda_gpu_id()); + static_cast(context)->device_id()); CUDA_ENFORCE(cudaEventRecord( wrapper->cuda_event_, static_cast(context)->cuda_stream())); @@ -96,7 +96,7 @@ void EventFinishCUDA(const Event* event) { if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) { // ok, even if event is already completed and status was not yet updated - DeviceGuard g(wrapper->cuda_gpu_id_); + DeviceGuard g(wrapper->device_id_); auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_); if (cudaResult == cudaSuccess) { wrapper->status_ = EventStatus::EVENT_SUCCESS; @@ -127,7 +127,7 @@ void EventWaitCUDACUDA(const Event* event, void* context) { if (context_stream != event_stream) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->cuda_gpu_id()); + // static_cast(context)->device_id()); CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0)); } } diff --git a/caffe2/core/hip/event_hip.cc b/caffe2/core/hip/event_hip.cc index 6f0db4642ddbba..ebec9c593e6eee 100644 --- a/caffe2/core/hip/event_hip.cc +++ b/caffe2/core/hip/event_hip.cc @@ -138,7 +138,7 @@ void EventWaitHIPHIP(const Event* event, void* context) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->cuda_gpu_id()); + // static_cast(context)->device_id()); HIP_CHECK(hipStreamWaitEvent(context_stream, wrapper->hip_event_, 0)); } } diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index d9816e787ba88c..87633fadebe34e 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -176,7 +176,7 @@ class ComputeBlobRecyclingForDag { // cuda device option but whose inputs/outputs are on CPU if (net.op(op_index).type() == "CopyGPUToCPU") { blob_device_[output].set_device_type(0); - blob_device_[output].set_cuda_gpu_id(0); + blob_device_[output].set_device_id(0); } } } @@ -478,7 +478,7 @@ class ComputeBlobRecyclingForDag { const DeviceOption& device_option) { const DeviceOption& blob_device = blob_device_[blob_name]; if (device_option.device_type() != blob_device.device_type() || - device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) { + device_option.device_id() != blob_device.device_id()) { return false; } for (const int token : req_tokens_[blob_name]) { diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index ce5fdbe7b7ed80..a694a4865c6cb3 100644 --- a/caffe2/core/net_async_base.cc +++ 
b/caffe2/core/net_async_base.cc @@ -157,7 +157,7 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { numa_node_id); return pool_getter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { - auto gpu_id = device_option.cuda_gpu_id(); + auto gpu_id = device_option.device_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); @@ -173,7 +173,7 @@ int AsyncNetBase::stream(int task_id) { const auto& device_option = event(task_id).GetDeviceOption(); int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.cuda_gpu_id(); + int gpu_id = device_option.device_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= getStreamCounters().size()) { getStreamCounters().resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 550a760826edd8..86d0b4d1d271dc 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -112,7 +112,7 @@ AsyncDAGNet::AsyncDAGNet( int AsyncDAGNet::stream(const DeviceOption& device_option) { int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.cuda_gpu_id(); + int gpu_id = device_option.device_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_gpu_test.cc b/caffe2/core/net_gpu_test.cc index eaea9377f9bcac..fab56112ec227c 100644 --- a/caffe2/core/net_gpu_test.cc +++ b/caffe2/core/net_gpu_test.cc @@ -124,7 +124,7 @@ TEST(NetTest, DISABLED_ChainingForDifferentDevices) { type: "NetTestDummy" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } )DOC"; diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 79be08c03b2325..8115ae3aab6a3c 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -649,7 +649,7 @@ std::map> ValidateTensorDevices( &blob_device); if (blob_device.device_type() == PROTO_CUDA && - blob_device.cuda_gpu_id() != op_device.cuda_gpu_id()) { + blob_device.device_id() != op_device.device_id()) { mismatches[blob_name] = std::make_pair(op_device, blob_device); } else if ( blob_device.device_type() == PROTO_HIP && diff --git a/caffe2/mkl/utils/mkl_memory.cc b/caffe2/mkl/utils/mkl_memory.cc index 3f05f9c5d24bde..9d4f347a13cb81 100644 --- a/caffe2/mkl/utils/mkl_memory.cc +++ b/caffe2/mkl/utils/mkl_memory.cc @@ -26,7 +26,7 @@ static vector GetMKLTensorInfo( const mkl::MKLMemory* tc = static_cast*>(c); *capacity = tc->size() * sizeof(T); device->set_device_type(PROTO_MKLDNN); - device->set_cuda_gpu_id(0); + device->set_device_id(0); return tc->dims(); } diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index bf4e20b7904711..5bd9b0a11b0921 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -70,7 +70,7 @@ void ProfileOperatorObserver::Start() { int device; cudaGetDevice(&device); - cudaSetDevice(context->cuda_gpu_id()); + cudaSetDevice(context->device_id()); cudaEventCreate(&start_); cudaEventRecord(start_, context->cuda_stream()); @@ -92,7 +92,7 @@ void ProfileOperatorObserver::Stop() { int device; cudaGetDevice(&device); - cudaSetDevice(context->cuda_gpu_id()); + cudaSetDevice(context->device_id()); cudaEventCreate(&stop_); 
cudaEventRecord(stop_, context->cuda_stream()); cudaEventSynchronize(stop_); diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 2350910febff27..8a21fa0acf679c 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -65,7 +65,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) { {DeviceType::CUDA, caffe2::DeviceType::CUDA}}; caffe2::DeviceOption d; d.set_device_type(static_cast(m.at(onnx_device.type))); - d.set_cuda_gpu_id(onnx_device.device_id); + d.set_device_id(onnx_device.device_id); return d; } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index cd70e9c2b5df2f..8458fab901ed8b 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -8,7 +8,7 @@ void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); - device_detail->set_cuda_gpu_id(CaffeCudaGetDevice()); + device_detail->set_device_id(CaffeCudaGetDevice()); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc index e16e2073f7fd12..061f54d3a4cb0e 100644 --- a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc +++ b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc @@ -72,11 +72,11 @@ void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) { if (gpu_id == -1 && rnn_op.op->device_option().device_type() == DeviceTypeProto::PROTO_CUDA) { - gpu_id = rnn_op.op->device_option().cuda_gpu_id(); + gpu_id = rnn_op.op->device_option().device_id(); } else { CAFFE_ENFORCE( rnn_op.op->device_option().device_type() == 0 || - rnn_op.op->device_option().cuda_gpu_id() == gpu_id, + rnn_op.op->device_option().device_id() == gpu_id, "RNN Executor only supports ops on one GPU"); } diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 71870010293492..21bdec2c6883b1 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -135,7 +135,7 @@ message DeviceOption { // optional DeviceType device_type = 1 [ default = CPU ]; optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU. // [CUDA specific] the cuda gpu id. - optional int32 cuda_gpu_id = 2; + optional int32 device_id = 2; // [general] The random seed to start the device random number generator with. optional uint32 random_seed = 3; // [general] What node this op should execute on. 
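Since `device_id` keeps field number 2 in DeviceOption, serialized protos
stay wire-compatible across this rename; only the generated accessor names
change. A minimal sketch of the new call pattern for downstream Python code
(modeled on the `gpu_device` helper in nccl_ops_test.py above; the trailing
assert is illustrative only, not part of this diff):

    from caffe2.proto import caffe2_pb2

    def gpu_device(gpu_id=0):
        # Build a CUDA DeviceOption; before this codemod the field was
        # written as `cuda_gpu_id`, after it as `device_id`.
        device_option = caffe2_pb2.DeviceOption()
        device_option.device_type = caffe2_pb2.CUDA
        device_option.device_id = gpu_id
        return device_option

    opt = gpu_device(1)
    assert opt.device_id == 1  # old call sites read opt.cuda_gpu_id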
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index f927020e6ae88f..f9ccf92d75099b 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -236,5 +236,5 @@ def CPU(self):
     def GPU(self, gpu_id=0):
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = gpu_id
+        device_option.device_id = gpu_id
         return device_option
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 6850c02fc13964..4f683daa368240 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -84,7 +84,7 @@ def IsOperatorWithEngine(op_type, engine):
 
 def DeviceOption(
     device_type,
-    cuda_gpu_id=0,
+    device_id=0,
     random_seed=None,
     node_name=None,
     numa_node_id=None,
@@ -92,7 +92,7 @@ def DeviceOption(
 ):
     option = caffe2_pb2.DeviceOption()
     option.device_type = device_type
-    option.cuda_gpu_id = cuda_gpu_id
+    option.device_id = device_id
     if node_name is not None:
         option.node_name = node_name
     if random_seed is not None:
@@ -115,7 +115,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr
     if not opt1.device_type or not opt2.device_type:
         # At least one option is for CPU, check if both are for CPU.
         return not opt1.device_type and not opt2.device_type
-    return opt1.cuda_gpu_id == opt2.cuda_gpu_id
+    return opt1.device_id == opt2.device_id
 
 
 def InferBlobDevices(net):
@@ -2111,7 +2111,7 @@ def RunAllOnGPU(self, gpu_id=0, use_cudnn=False):
         """A convenient function to run everything on the GPU."""
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = gpu_id
+        device_option.device_id = gpu_id
         self._net.device_option.CopyFrom(device_option)
         if use_cudnn:
             for op in self._net.op:
@@ -2286,7 +2286,7 @@ def copy_func_between_devices(src, dst):
         return None
 
     if src.device_type == CUDA and dst.device_type == CUDA:
-        if src.cuda_gpu_id == dst.cuda_gpu_id:
+        if src.device_id == dst.device_id:
             return None
         else:
             def fun(net, *args, **kw):
@@ -2312,10 +2312,10 @@ def fun(net, *args, **kw):
 
 def device_equal(src, dst):
     '''
     We are using this function instead of == operator because optional-value
-    comparison between empty device_options and {device_type:0, cuda_gpu_id:0}
+    comparison between empty device_options and {device_type:0, device_id:0}
     returns not equal in some cases.
     '''
-    return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id
+    return src.device_type == dst.device_type and src.device_id == dst.device_id
 
 
 def update_placeholder_op_output(op, blob_to_device):
@@ -2429,7 +2429,7 @@ def _gen_new_name(blob, device_option):
     if device_option.device_type == CPU:
         suffix = '_cpu'
     elif device_option.device_type == CUDA:
-        suffix = '_cuda_' + str(device_option.cuda_gpu_id)
+        suffix = '_cuda_' + str(device_option.device_id)
     else:
         raise RuntimeError(
             "Unknown device type: {}".
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 7120843f33152d..2f6dedbfd80c83 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -83,17 +83,17 @@ def testDeviceScope(self): # explicitly setting a device device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 op = core.CreateOperator("Relu", "x", "y", device_option=device_option) self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) with core.DeviceScope(device_option): # from device scope op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) # from an overridden device option override_device = caffe2_pb2.DeviceOption() override_device.device_type = caffe2_pb2.CPU @@ -109,13 +109,13 @@ def testDeviceScope(self): def testNameAndDeviceScopeTogether(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): with core.NameScope("foo"): op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(len(op.input), 1) self.assertEqual(op.input[0], "foo/x") self.assertEqual(len(op.output), 1) @@ -255,7 +255,7 @@ class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 op = core.CreateOperator( "Ludicrous", "x", "y", name="ludicrous", control_input="z", device_option=device_option, @@ -271,7 +271,7 @@ def testCreate(self): self.assertEqual(op.control_input[0], "z") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertTrue(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args @@ -574,7 +574,7 @@ def test_check_equal_default_value(self): opt2 = caffe2_pb2.DeviceOption() opt1.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.cuda_gpu_id = 5 + opt1.device_id = 5 # opt1 still is on CPU, so the options should be equal self.assertTrue(core.device_option_equal(opt1, opt2)) opt2.device_type = 0 @@ -649,7 +649,7 @@ class TestInferDevice(test_util.TestCase): def setUp(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 self.cuda_option = device_option self.cpu_option = caffe2_pb2.DeviceOption() @@ -748,7 +748,7 @@ def test_inject_copy(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) @@ -765,7 +765,7 @@ def 
test_inject_copy(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") self.assertNotEqual(blob_to_device["fc_w"], device_option) @@ -775,7 +775,7 @@ def test_cross_nets(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) const = init_net.ConstantFill([], 'const', shape=[], value=1.) @@ -791,12 +791,12 @@ def test_cross_nets(self): op = nets[1]._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "fc_w_cuda_1") op = nets[1]._net.op[1] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "fc_b_cuda_1") op = nets[1]._net.op[2] self.assertEqual(op.type, "FC") @@ -804,7 +804,7 @@ def test_cross_nets(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) op = nets[1]._net.op[3] self.assertEqual(op.type, "Add") self.assertEqual(op.input[0], "fc1") @@ -822,7 +822,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -832,7 +832,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -844,7 +844,7 @@ def test_cross_nets(self): type: "FC" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -855,7 +855,7 @@ def test_cross_nets(self): type: "Add" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } external_input: "data" @@ -870,7 +870,7 @@ def test_cross_nets_no_change(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) @@ -887,7 +887,7 @@ def test_cross_nets_no_change(self): self.assertEqual(op.input[1], "fc_w") self.assertEqual(op.input[2], "fc_b") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) """ For reference, net.Proto() should be like: name: "" @@ -900,7 +900,7 @@ def test_cross_nets_no_change(self): type: "FC" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } external_input: "data" @@ -912,7 +912,7 @@ def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu1") @@ -920,10 
+920,10 @@ def test_inject_copy_multi_use(self): with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") - device_option.cuda_gpu_id = 0 + device_option.device_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") @@ -931,12 +931,12 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") @@ -945,7 +945,7 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[3] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] @@ -955,18 +955,18 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """ @@ -979,7 +979,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -989,7 +989,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -1005,7 +1005,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -1021,7 +1021,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 0 + device_id: 0 } } op { @@ -1031,7 +1031,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 0 + device_id: 0 } } op { @@ -1041,7 +1041,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } external_input: "data" @@ -1060,7 +1060,7 @@ def test_inject_copy_placeholder_ops(self): cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) gpu_device[i].device_type = caffe2_pb2.CUDA - gpu_device[i].cuda_gpu_id = 0 + gpu_device[i].device_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 
'node:1' @@ -1100,12 +1100,12 @@ def test_inject_copy_placeholder_ops(self): op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) @@ -1128,7 +1128,7 @@ def test_blob_inplace(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) with core.DeviceScope(device_option): diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 89770dc6ea7d9a..749c8b12c930e8 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -813,7 +813,7 @@ def builder_fun(model): device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" - namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) + namescope = "{}_{}/".format(device_prefix, device.device_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -1540,7 +1540,7 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.cuda_gpu_id + op_gpu = op_dev.device_id # This avoids failing on operators that are only for CPU if op_dev.device_type != caffe2_pb2.CUDA: @@ -1904,7 +1904,7 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.cuda_gpu_id].append(op) + ops[op.device_option.device_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 5cc18f99bd9eb9..8470df1588717f 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -259,7 +259,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # Include device option for each GPU expanded_device_options = [cpu_do] + ( - [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) + [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, device_id=i) for i in range(workspace.NumCudaDevices())] if workspace.has_gpu_support else []) diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index f8e3f32bb2c225..1e881d27f49dc8 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -596,7 +596,7 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not None: step_op.device_option.device_type = device.device_type - step_op.device_option.cuda_gpu_id = device.cuda_gpu_id + step_op.device_option.device_id = device.device_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -610,7 +610,7 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.cuda_gpu_id = device.cuda_gpu_id + op.device_option.device_id = device.device_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py 
b/caffe2/python/muji.py index b407f96d2391f8..2f2b5aced6640e 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -26,7 +26,7 @@ def OnGPU(gpu_id): """ device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + device_option.device_id = gpu_id return device_option diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 4b5cddb61d244e..7583f863b1f5ad 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -268,11 +268,11 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): if not dev_opt or not ( - dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): + dev_opt.device_type or dev_opt.device_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, dev_opt.device_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 8d3a362dcdf725..3178345cf46e21 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -27,7 +27,7 @@ def build_test_net(net_name): gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = caffe2_pb2.CUDA - gpu_device_option.cuda_gpu_id = 0 + gpu_device_option.device_id = 0 net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu", device_option=gpu_device_option) diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 8cc3f9e2fa98eb..5802e49de526dc 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -24,7 +24,7 @@ def __init__(self, init_net, predict_net, workspace, uninitialized): @property def _name_scope(self): if self.predict_net.device_option.device_type == caffe2_pb2.CUDA: - return 'gpu_{}'.format(self.predict_net.device_option.cuda_gpu_id) + return 'gpu_{}'.format(self.predict_net.device_option.device_id) return '' def run(self, inputs, **kwargs): diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index 07f378beb18ff0..b90a7f84b4ed8a 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -89,7 +89,7 @@ def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll): self.assertEqual(proto.tensor.device_detail.device_type, device_type) if device_type == caffe2_pb2.CUDA: - self.assertEqual(proto.tensor.device_detail.cuda_gpu_id, + self.assertEqual(proto.tensor.device_detail.device_id, gpu_id) blobs = [str(i) for i in range(len(arrays))] diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 9d9bb38e178517..66ac07dbdca079 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1216,7 +1216,7 @@ def test_lstm_extract_predictor_net(self): if arg.name == "step_net": for step_op in arg.n.op: self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.cuda_gpu_id) + self.assertEqual(1, step_op.device_option.device_id) elif arg.name == 'backward_step_net': self.assertEqual(caffe2_pb2.NetDef(), arg.n) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 482d16a0dfa6a6..5454b8cd4d3fba 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -81,7 +81,7 @@ def make_unique_blob_name(self, base_str): if 
current_scope.device_type == caffe2_pb2.CUDA: return self.get_gpu_blob_name( - base_str, current_scope.cuda_gpu_id, current_scope.node_name + base_str, current_scope.device_id, current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -277,7 +277,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name), shape=[1], value=1.0 ) @@ -486,12 +486,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + [], "wd_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=self.weight_decay ) @@ -1158,7 +1158,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0 ) diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index b4c71535debe66..ef11246bdfcc9b 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -193,7 +193,7 @@ def test_load_device_scope(self): # check device options for op in list(init_net.Proto().op) + list(predict_init_net.Proto().op): - self.assertEqual(1, op.device_option.cuda_gpu_id) + self.assertEqual(1, op.device_option.device_id) self.assertEqual(caffe2_pb2.CPU, op.device_option.device_type) def test_db_fails_without_params(self): diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 679152c788132e..6db4ae42b84742 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -34,7 +34,7 @@ class DLPackWrapper { "Unsupported device type: ", device_option.device_type()); tensor_context.device_type = *device_type_ptr; - tensor_context.device_id = device_option.cuda_gpu_id(); + tensor_context.device_id = device_option.device_id(); if (tensor->size() <= 0) { tensor->Resize(0); @@ -87,7 +87,7 @@ class DLPackWrapper { int dlpack_device_id = dlTensor->ctx.device_id; CAFFE_ENFORCE_EQ( dlpack_device_id, - device_option.cuda_gpu_id(), + device_option.device_id(), "Expected same device id for DLPack and C2 tensors"); std::vector dims; diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index dc8e088eba97c5..dd80282238a80b 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -30,7 +30,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { case PROTO_CPU: return option.numa_node_id(); case PROTO_CUDA: - return option.cuda_gpu_id(); + return option.device_id(); case PROTO_MKLDNN: return option.numa_node_id(); case PROTO_HIP: @@ -43,7 +43,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { return ( lhs.device_type() == rhs.device_type() && - lhs.cuda_gpu_id() == rhs.cuda_gpu_id() && + lhs.device_id() == rhs.device_id() && lhs.hip_gpu_id() == rhs.hip_gpu_id() && lhs.node_name() == rhs.node_name() && lhs.numa_node_id() == rhs.numa_node_id()); diff --git 
a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc
index c9f37f4c98c290..5d8fb86b34e3bb 100644
--- a/caffe2/utils/proto_utils_test.cc
+++ b/caffe2/utils/proto_utils_test.cc
@@ -11,9 +11,9 @@ TEST(ProtoUtilsTest, IsSameDevice) {
   EXPECT_FALSE(IsSameDevice(a, b));
   b.set_node_name("my_node");
   EXPECT_TRUE(IsSameDevice(a, b));
-  b.set_cuda_gpu_id(2);
+  b.set_device_id(2);
   EXPECT_FALSE(IsSameDevice(a, b));
-  a.set_cuda_gpu_id(2);
+  a.set_device_id(2);
   EXPECT_TRUE(IsSameDevice(a, b));
   a.set_device_type(DeviceTypeProto::PROTO_CUDA);
   b.set_device_type(DeviceTypeProto::PROTO_CPU);
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 113403fd87bbf4..3a98a4cb7d9f3e 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -2216,7 +2216,7 @@
 "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2),
 "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2),
 "curand_generator" : ("hiprand_generator", API_CAFFE2),
-"cuda_gpu_id" : ("hip_gpu_id", API_CAFFE2),
+"device_id" : ("hip_gpu_id", API_CAFFE2),
 "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2),
 }

From d291cf7de6487ea351ba015555ebb2dd2c660370 Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 27 Sep 2018 20:33:11 -0700
Subject: [PATCH 36/82] Move TensorImpl size_from_dim, size_to_dim,
 size_between_dim, canonical_axis_index to caffe2::Tensor (#12099)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12099

- Generalize the free functions to accept IntList, not just std::vector<int64_t>

Reviewed By: jerryzh168

Differential Revision: D10051365

fbshipit-source-id: e3d571bf8fead22f6f25c3ca46f0c38c2bb065d2
---
 aten/src/ATen/core/TensorImpl.h | 43 ++++++---------------------
 caffe2/core/tensor.h            | 19 ++++++++++++---
 2 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index 7552bfa16d8fb1..47e4f444d0e702 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -48,14 +48,14 @@ class Tensor;
 
 /**
  * A utility function to convert vector<int> to vector<int64_t>.
  */
-inline std::vector<int64_t> ToVectorint64_t(const std::vector<int>& src) {
+inline std::vector<int64_t> ToVectorint64_t(ArrayRef<int> src) {
   return std::vector<int64_t>(src.begin(), src.end());
 }
 
 /**
  * Return product of all dimensions starting from k
  */
-inline int64_t size_from_dim_(int k, const std::vector<int64_t>& dims) {
+inline int64_t size_from_dim_(int k, IntList dims) {
   int64_t r = 1;
   for (size_t i = k; i < dims.size(); ++i) {
     r *= dims[i];
@@ -64,7 +64,7 @@ inline int64_t size_from_dim_(int k, const std::vector<int64_t>& dims) {
 }
 
 // Product of all dims up to k (not including dims[k])
-inline int64_t size_to_dim_(int k, const std::vector<int64_t>& dims) {
+inline int64_t size_to_dim_(int k, IntList dims) {
   CAFFE_ENFORCE((unsigned)k <= dims.size());
   int64_t r = 1;
   for (int i = 0; i < k; ++i) {
@@ -74,7 +74,7 @@ inline int64_t size_to_dim_(int k, const std::vector<int64_t>& dims) {
 }
 
 // Product of all dims between k and l (not including dims[k] and dims[l])
-inline int64_t size_between_dim_(int k, int l, const std::vector<int64_t>& dims) {
+inline int64_t size_between_dim_(int k, int l, IntList dims) {
   CAFFE_ENFORCE((unsigned)l < dims.size());
   int64_t r = 1;
   if (k < l) {
@@ -785,45 +785,16 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     return static_cast<T*>(raw_mutable_data(caffe2::TypeMeta::Make<T>()));
   }
 
-  // NB: This capacity may also include available space
-  // in the storage BEFORE the tensor data, if storage_offset != 0
-  inline size_t capacity_nbytes() const {
-    return storage_.capacity();
-  }
   /**
    * Returns the dimensions of the tensor as a vector.
    */
   inline const std::vector<int64_t>& dims() const {
+    // TODO: This method will no longer work if we change the
+    // internal representation of dims().  That's BAD.  Let's get
+    // people to stop using this.
     return sizes_;
   }
 
-  inline int64_t size_from_dim(int k) const {
-    return size_from_dim_(k, sizes_);
-  }
-
-  inline int64_t size_to_dim(int k) const {
-    return size_to_dim_(k, sizes_);
-  }
-
-  inline int64_t size_between_dim(int k, int l) const {
-    return size_between_dim_(k, l, sizes_);
-  }
-
-  /**
-   * Returns the 'canonical' version of a (usually) user-specified axis,
-   * allowing for negative indexing (e.g., -1 for the last axis).
-   *
-   * @param axis_index the axis index.
-   *        If 0 <= index < dim(), return index.
-   *        If -ndim <= index <= -1, return (dim() - (-index)),
-   *        e.g., the last axis index (dim() - 1) if index == -1,
-   *        the second to last if index == -2, etc.
-   *        Dies on out of range index.
-   */
-  inline int canonical_axis_index(int axis_index) const {
-    return canonical_axis_index_(axis_index, dim());
-  }
-
   /**
    * Checks if the tensor content is of the given data type.
    */
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 8795ba379eeb68..b057d92aa4a62d 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -347,19 +347,30 @@ class CAFFE2_API Tensor final {
   }
 
   inline int64_t size_from_dim(int k) const {
-    return impl_.get()->size_from_dim(k);
+    return size_from_dim_(k, impl_->sizes());
   }
 
   inline int64_t size_to_dim(int k) const {
-    return impl_.get()->size_to_dim(k);
+    return size_to_dim_(k, impl_->sizes());
   }
 
   inline int64_t size_between_dim(int k, int l) const {
-    return impl_.get()->size_between_dim(k, l);
+    return size_between_dim_(k, l, impl_->sizes());
   }
 
+  /**
+   * Returns the 'canonical' version of a (usually) user-specified axis,
+   * allowing for negative indexing (e.g., -1 for the last axis).
+   *
+   * @param axis_index the axis index.
+   *        If 0 <= index < dim(), return index.
+ * If -ndim <= index <= -1, return (dim() - (-index)), + * e.g., the last axis index (dim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ inline int canonical_axis_index(int axis_index) const { - return impl_.get()->canonical_axis_index(axis_index); + return canonical_axis_index_(axis_index, impl_->dim()); } inline int64_t stride(int64_t dim) const { From f5a0c337ba8354b7f314abea4fe6037120f49e3f Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 27 Sep 2018 20:33:13 -0700 Subject: [PATCH 37/82] Move TensorImpl IsType, meta, dim32, dim, ExtractDeviceOption to caffe2::Tensor Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12100 Reviewed By: jerryzh168 Differential Revision: D10051424 fbshipit-source-id: 5986e92ea54e60ec6bfe992015a05e09288c948c --- aten/src/ATen/core/TensorImpl.h | 57 +++------------------------------ caffe2/core/tensor.h | 31 +++++++++++++++--- 2 files changed, 30 insertions(+), 58 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 47e4f444d0e702..d16e5a83cb6683 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -406,12 +406,12 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if ((void*)&src == (void*)this) { return; } - if (data_type_ != src.meta()) { + if (data_type_ != src.dtype()) { CAFFE_ENFORCE_WITH_CALLER( src.is_contiguous(), "Right now only copy of contiguous source Tensor is supported."); - storage_ = at::Storage(device_type(), src.meta()); - data_type_ = src.meta(); + storage_ = at::Storage(device_type(), src.dtype()); + data_type_ = src.dtype(); } if (src.numel() == -1) { sizes_.clear(); @@ -774,7 +774,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ template inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { + if ((numel_ == 0 || storage_.data()) && storage_.IsType()) { return static_cast(storage_.data()) + storage_offset_; } // Check it here statically - otherwise TypeMeta would throw the runtime @@ -795,55 +795,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return sizes_; } - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const caffe2::TypeMeta& meta() const { - return data_type_; - } - - /** - * Returns the i-th dimension of the tensor in int. - * - * This function returns an int value instead of int64_t, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(sizes_.size()), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(sizes_[i], std::numeric_limits::max()); - return static_cast(sizes_[i]); - } - - /** - * Returns the i-th dimension of the tensor. Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. 
- */ - inline int64_t dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(sizes_.size()), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return sizes_[i]; - } - - void ExtractDeviceOption(caffe2::DeviceOption* device) const { - auto* context = GetStaticContext(); - CHECK(context); - context->ExtractDeviceOption(device, data()); - } - protected: // we decide to keep reserved_ and it will // live in Tensor after the split diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index b057d92aa4a62d..0563221feb2e83 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -385,25 +385,46 @@ class CAFFE2_API Tensor final { return impl_.get()->is_contiguous(); } + /** + * Checks if the tensor content is of the given data type. + */ template inline bool IsType() const { - return impl_.get()->IsType(); + return impl_->storage().IsType(); } + /** + * Returns the TypeMeta object associated with the current data type. + */ inline const TypeMeta& meta() const { - return impl_.get()->meta(); + return impl_->dtype(); } + /** + * Returns the i-th dimension of the tensor in int. + * + * This function returns an int value instead of int64_t, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. + */ inline int dim32(const int i) const { - return impl_.get()->dim32(i); +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(impl_->dim()), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + auto s = impl_->size(i); + CAFFE_ENFORCE_LT_WITH_CALLER(s, std::numeric_limits::max()); + return static_cast(s); } inline int64_t dim(const int i) const { - return impl_.get()->dim(i); + return impl_->size(i); } inline void ExtractDeviceOption(DeviceOption* device) const { - return impl_.get()->ExtractDeviceOption(device); + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, impl_->data()); } const Storage& storage() { From 04c0971679374148aa4105e2e998de6478eca1eb Mon Sep 17 00:00:00 2001 From: Satish Nadathur Date: Thu, 27 Sep 2018 21:10:36 -0700 Subject: [PATCH 38/82] Special case BatchGather and BatchGatherGradient for block_size=1. (#11349) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11349 Special case BatchGather and BatchGatherGradient for block_size=1. This makes BatchGather 3-4X faster and BatchGatherGradient 10X for this case. 
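A minimal NumPy sketch of the gather semantics being specialized here (an
illustration of the contract, not the operator code in this diff; shapes
follow the (batch_size, rows_num, block_size) convention from
gather_ops_test.py):

    import numpy as np

    def batch_gather(data, indices):
        # out[b, i, :] = data[b, indices[i], :] -- what BatchGather computes
        # for every batch b and every gathered index i.
        return data[:, indices, :]

    data = np.random.rand(4, 100, 1).astype(np.float32)  # block_size == 1
    idx = np.array([3, 7, 7, 42])
    out = batch_gather(data, idx)
    assert out.shape == (4, 4, 1)

With block_size == 1 each per-index CopyItemsSameDevice call degenerates to
copying a single element, so a plain indexing loop (and, in the gradient, a
scalar `out[idx] += grad[i]` scatter-add) removes the per-element call
overhead; that is where the reported 3-4X and 10X speedups come from.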
Reviewed By: jspark1105, ilia-cher Differential Revision: D7218043 fbshipit-source-id: ea12042239a8adc92b9efcbd0b66e354fb43f4c7 --- caffe2/operators/batch_gather_ops.h | 90 +++++++++++++------ .../python/operator_test/gather_ops_test.py | 3 +- 2 files changed, 63 insertions(+), 30 deletions(-) diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h index 2b9e4d6d5e6ecc..07ee6187443a97 100644 --- a/caffe2/operators/batch_gather_ops.h +++ b/caffe2/operators/batch_gather_ops.h @@ -35,31 +35,52 @@ class BatchGatherOp final : public Operator { auto block_size = data.size_from_dim(2); auto block_bytesize = block_size * data.meta().itemsize(); auto N = indices.size(); - auto data_batch_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto gathered_batch_bytesize = - N * data.size_from_dim(2) * data.meta().itemsize(); + auto data_batch_size = data.size_from_dim(1); + auto gathered_batch_size = N * data.size_from_dim(2); + auto data_batch_bytesize = data_batch_size * data.meta().itemsize(); + auto gathered_batch_bytesize = gathered_batch_size * data.meta().itemsize(); const TInd* idxs = indices.template data(); auto src_base = static_cast(data.raw_data()); auto out = static_cast(output->raw_mutable_data(data.meta())); - for (auto batch = 0; batch < data.dim(0); ++batch) { - for (auto i = 0; i < N; ++i) { - auto idx = idxs[i]; - CAFFE_ENFORCE( - 0 <= idx && idx < data.dim(1), - "INDICES element is out of DATA bounds, id=", - idx, - " data_dim=", - data.dim(1)); - auto src = - src_base + idx * block_bytesize + batch * data_batch_bytesize; - auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize; - context_.CopyItemsSameDevice(data.meta(), block_size, src, dst); + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < data.dim(1), + "INDICES element is out of DATA bounds, id=", + idx, + " data_dim=", + data.dim(1)); + } + + if (data.template IsType() && block_size == 1) { + auto src = data.template data(); + auto dst = output->template mutable_data(); + + for (auto batch = 0; batch < data.dim(0); ++batch) { + auto src_batch_base = src + batch * data_batch_size; + auto out_batch_base = dst + batch * gathered_batch_size; + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + out_batch_base[i] = src_batch_base[idx]; + } + } + } else { + for (auto batch = 0; batch < data.dim(0); ++batch) { + auto src_batch_base = src_base + batch * data_batch_bytesize; + auto out_batch_base = out + batch * gathered_batch_bytesize; + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + auto src = src_batch_base + idx * block_bytesize; + auto dst = out_batch_base + i * block_bytesize; + context_.CopyItemsSameDevice(data.meta(), block_size, src, dst); + } } } return true; } - INPUT_TAGS(DATA, INDICES); }; @@ -108,21 +129,32 @@ class BatchGatherGradientOp final : public Operator { auto gathered_batch_size = N * data.size_from_dim(2); const TInd* idxs = indices.template data(); + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < data.dim(1), + "INDICES element is out of DATA bounds, id=", + idx, + " data_dim=", + data.dim(1)); + } + for (auto batch = 0; batch < grad.dim(0); ++batch) { + auto src_batch_base = grad_data + batch * gathered_batch_size; + auto out_batch_base = out_data + batch * data_batch_size; + for (auto i = 0; i < N; ++i) { auto idx = idxs[i]; - CAFFE_ENFORCE( - 0 <= idx && idx < data.dim(1), - "INDICES element is out of DATA bounds, id=", - idx, - " data_dim=", - 
data.dim(1)); - math::Add( - block_size, - out_data + idx * block_size + batch * data_batch_size, - grad_data + i * block_size + batch * gathered_batch_size, - out_data + idx * block_size + batch * data_batch_size, - &context_); + if (block_size == 1) { + out_batch_base[idx * block_size] += src_batch_base[i * block_size]; + } else { + math::Add( + block_size, + out_batch_base + idx * block_size, + src_batch_base + i * block_size, + out_batch_base + idx * block_size, + &context_); + } } } return true; diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index d5ab8e58cec0f2..5d05c1e5b23b91 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -39,10 +39,11 @@ def _inputs(draw): rows_num = draw(st.integers(1, 100)) index_num = draw(st.integers(1, 10)) batch_size = draw(st.integers(2, 10)) + block_size = draw(st.integers(1, 2)) return ( draw(hnp.arrays( np.float32, - (batch_size, rows_num, 2), + (batch_size, rows_num, block_size), elements=st.floats(-10.0, 10.0), )), draw(hnp.arrays( From d291cf7de6487ea351ba015555ebb2dd2c660370 Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Fri, 28 Sep 2018 07:09:31 -0700 Subject: [PATCH 39/82] Ensuring positive definite matrix before constructing (#12102) Summary: Ensuring positive definite matrix in Multivariate Normal Distribution Pull Request resolved: https://github.com/pytorch/pytorch/pull/12102 Reviewed By: ezyang, Balandat Differential Revision: D10052091 Pulled By: jeffreyksmithjr fbshipit-source-id: 276cfc6995f6a217a5ad9eac299445ff1b67a65f --- test/test_distributions.py | 6 ++++++ torch/distributions/multivariate_normal.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 5fbc2003be27e4..69aa6cd73b465e 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -542,6 +542,12 @@ def is_all_nan(tensor): 'scale': torch.tensor([1., -1.], requires_grad=True), }, ]), + Example(MultivariateNormal, [ + { + 'loc': torch.tensor([1., 1.], requires_grad=True), + 'covariance_matrix': torch.tensor([[1.0, 0.0], [0.0, -2.0]], requires_grad=True), + }, + ]), Example(Normal, [ { 'loc': torch.tensor([1., 1.], requires_grad=True), diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 014a07e53c9532..345fe35ceee614 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -125,27 +125,29 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, scale_tri if scale_tril.dim() < 2: raise ValueError("scale_tril matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self._unbroadcasted_scale_tril = scale_tril self.scale_tril, loc_ = torch.broadcast_tensors(scale_tril, loc_) elif covariance_matrix is not None: if covariance_matrix.dim() < 2: raise ValueError("covariance_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self._unbroadcasted_scale_tril = _batch_potrf_lower(covariance_matrix) self.covariance_matrix, loc_ = torch.broadcast_tensors(covariance_matrix, loc_) else: if precision_matrix.dim() < 2: raise ValueError("precision_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - covariance_matrix = _batch_inverse(precision_matrix) - self._unbroadcasted_scale_tril = _batch_potrf_lower(covariance_matrix) - 
self.covariance_matrix, self.precision_matrix, loc_ = torch.broadcast_tensors( - covariance_matrix, precision_matrix, loc_) + self.precision_matrix, loc_ = torch.broadcast_tensors(precision_matrix, loc_) self.loc = loc_[..., 0] # drop rightmost dim batch_shape, event_shape = self.loc.shape[:-1], self.loc.shape[-1:] super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + if scale_tril is not None: + self._unbroadcasted_scale_tril = scale_tril + else: + if precision_matrix is not None: + self.covariance_matrix = _batch_inverse(precision_matrix).expand_as(loc_) + self._unbroadcasted_scale_tril = _batch_potrf_lower(self.covariance_matrix) + def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(MultivariateNormal, _instance) batch_shape = torch.Size(batch_shape) From 5be0baefa2cd4202810d6971366c439a44dda69c Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Fri, 28 Sep 2018 07:41:26 -0700 Subject: [PATCH 40/82] Use streams in JIT serialization, allow JIT serialization to/from buffer (#11932) Summary: This PR replaces the use of `std::FILE` with `istream`/`ostream` for JIT serialization. It uses this mechanism to add the possibility to serialize to/from binary buffers, in addition to files, both in `libtorch` and from Python. `getExportImportCopy` in `test_jit.py` has been updated so that both file and buffer codepaths are exercised during tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11932 Differential Revision: D10084303 Pulled By: apaszke fbshipit-source-id: b850801b3932922fa1dbac6fdaed5063d58bc20d --- test/test_jit.py | 12 ++-- torch/csrc/jit/export.cpp | 25 +++++--- torch/csrc/jit/export.h | 6 ++ torch/csrc/jit/import.cpp | 37 ++++++++--- torch/csrc/jit/import.h | 9 +++ torch/csrc/jit/init.cpp | 1 - torch/csrc/jit/script/init.cpp | 17 ++++- torch/csrc/jit/script/module.cpp | 4 ++ torch/csrc/jit/script/module.h | 3 + torch/csrc/jit/serialization.h | 106 ++++++++++++++++++------------- torch/jit/__init__.py | 56 ++++++++++++++-- 11 files changed, 200 insertions(+), 76 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 85c6ce991292da..22e7a5f69b467c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -240,14 +240,10 @@ def getExportImportCopy(self, m): imported = torch.jit.load(f.name) finally: os.unlink(f.name) - f = tempfile.NamedTemporaryFile(delete=False) - try: - f.close() - imported.save(f.name) - imported = torch.jit.load(f.name) - finally: - os.unlink(f.name) - return imported + buffer = io.BytesIO() + torch.jit.save(imported, buffer) + buffer.seek(0) + return torch.jit.load(buffer) def assertGraphContains(self, graph, kind): self.assertTrue(any(n.kind() == kind for n in graph.nodes())) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index d9a9e79bc714d2..973780b7d7d62f 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -425,7 +426,7 @@ void GraphEncoder::EncodeTensor( class ModuleEncoder: public EncoderBase { public: ModuleEncoder(const script::Module &module, - const std::string &filename); + std::ostream& out); private: void EncodeModule(onnx::GraphProto *graph_proto, const script::Module &module); @@ -448,7 +449,7 @@ class ModuleEncoder: public EncoderBase { virtual void EncodeTensor(onnx::TensorProto *tensor_proto, const at::Tensor &tensor, - const at::optional external_ref) override; + const at::optional external_ref = {}) 
override; virtual void EncodeIntermediateValueInfo(onnx::GraphProto *graph_proto, const Value* n) override; @@ -462,7 +463,7 @@ class ModuleEncoder: public EncoderBase { const TypePtr& type, const std::string& name); - PyTorchFileWriter file_writer_; + PyTorchStreamWriter stream_writer_; // Used to deduplicate tensor storages std::unordered_map storage_dedup_map_; @@ -475,9 +476,9 @@ class ModuleEncoder: public EncoderBase { ModuleEncoder::ModuleEncoder( const script::Module &module, - const std::string &filename) + std::ostream& out) : EncoderBase(onnx_torch::OperatorExportTypes::RAW, false), - file_writer_(filename) { + stream_writer_(out) { model_proto_.set_doc_string("THIS PROTO IS NOT STANDARD ONNX"); EncodeModule(model_proto_.mutable_graph(), module); } @@ -586,7 +587,7 @@ void ModuleEncoder::EncodeModule( EncodeParameters(graph_proto, module, ""); EncodeMethods(graph_proto, module, ""); auto str = model_proto_.SerializeAsString(); - file_writer_.writeRecord(str.data(), str.size()); + stream_writer_.writeRecord(str.data(), str.size()); } void ModuleEncoder::EncodeParameters( @@ -674,7 +675,7 @@ void ModuleEncoder::EncodeMethod( void ModuleEncoder::EncodeTensor( onnx::TensorProto *tensor_proto, const at::Tensor &tensor, - const at::optional external_ref = {}) { + const at::optional external_ref) { auto storage_ptr = tensor.storage().unsafeGetStorageImpl(); auto dedup_it = storage_dedup_map_.find(storage_ptr); if (dedup_it != storage_dedup_map_.end()) { @@ -693,7 +694,7 @@ void ModuleEncoder::EncodeTensor( .cpu(); } - auto record_number = file_writer_.writeRecord( + auto record_number = stream_writer_.writeRecord( static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.storage().size()); tensor_proto->add_int64_data(record_number); storage_dedup_map_[storage_ptr] = record_number; @@ -919,8 +920,14 @@ std::tuple ExportGraph( graph_encoder.get_raw_data_export_map()); } +void ExportModule(const script::Module& module, std::ostream& out) { + ModuleEncoder(module, out); +} + void ExportModule(const script::Module& module, const std::string &filename) { - ModuleEncoder(module, filename); + std::ofstream out(filename, std::ios_base::binary); + + ExportModule(module, out); } }} diff --git a/torch/csrc/jit/export.h b/torch/csrc/jit/export.h index f7eee3dc77ac07..363de0b56ac169 100644 --- a/torch/csrc/jit/export.h +++ b/torch/csrc/jit/export.h @@ -4,6 +4,8 @@ #include "torch/csrc/jit/script/module.h" #include "torch/csrc/onnx/onnx.h" +#include + namespace torch { namespace jit { // This map is used to keep track of parameters that should be exported @@ -34,6 +36,10 @@ TORCH_API std::string PrettyPrintExportedGraph( = ::torch::onnx::OperatorExportTypes::ONNX, bool google_printer = false); +TORCH_API void ExportModule( + const script::Module& module, + std::ostream& out); + TORCH_API void ExportModule( const script::Module& module, const std::string& filename); diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 212c28e4c10db1..4574addb3a4465 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -181,7 +182,7 @@ void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, class ModuleDecoder : DecoderBase { public: ModuleDecoder(ModuleLookup module_lookup, - const std::string& filename); + std::istream& in); private: virtual std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) override; @@ -205,7 +206,7 @@ class ModuleDecoder : 
DecoderBase { ModuleLookup module_lookup, const std::string fullname); - PyTorchFileReader file_reader_; + PyTorchStreamReader stream_reader_; std::unordered_map> storage_map_; std::unordered_map value_type_map_; }; @@ -319,7 +320,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( if (storage_it == storage_map_.end()) { at::DataPtr storage_ptr; int64_t size; - std::tie(storage_ptr, size) = file_reader_.getRecordWithKey(record_number); + std::tie(storage_ptr, size) = stream_reader_.getRecordWithKey(record_number); auto storage = std::make_shared( at::CPU(type).typeMeta(), std::move(storage_ptr), @@ -353,10 +354,10 @@ std::pair, std::string> ModuleDecoder::parseFull ModuleDecoder::ModuleDecoder( ModuleLookup module_lookup, - const std::string &filename) : - file_reader_(filename) { + std::istream& in) : + stream_reader_(in) { auto model_proto = onnx::ModelProto(); - auto record = file_reader_.getLastRecord(); + auto record = stream_reader_.getLastRecord(); model_proto.ParsePartialFromArray(std::get<0>(record).get(), std::get<1>(record)); auto graph_proto = model_proto.graph(); @@ -395,13 +396,21 @@ ModuleDecoder::ModuleDecoder( } // namespace +void import_ir_module( + ModuleLookup module_lookup, + std::istream& in) { + ModuleDecoder(module_lookup, in); +} + void import_ir_module( ModuleLookup module_lookup, const std::string& filename) { - ModuleDecoder(module_lookup, filename); + std::ifstream in(filename, std::ios_base::binary); + + ModuleDecoder(module_lookup, in); } -std::shared_ptr load(const std::string& filename) { +std::shared_ptr load(std::istream& in) { auto module = std::make_shared(); auto module_lookup = [&](const std::vector& qualified_name) { @@ -414,7 +423,17 @@ std::shared_ptr load(const std::string& filename) { } return curr; }; - ModuleDecoder(module_lookup, filename); + + ModuleDecoder(module_lookup, in); + + return module; +} + +std::shared_ptr load(const std::string& filename) { + std::ifstream in(filename, std::ios_base::binary); + + auto module = load(in); + return module; } diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index 6ce901c4369961..a1e0b31fe2295a 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -3,6 +3,8 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/script/module.h" +#include + namespace torch { namespace jit { @@ -13,11 +15,18 @@ TORCH_API void import_ir_module( ModuleLookup module_lookup, const std::string& filename); +TORCH_API void import_ir_module( + ModuleLookup module_lookup, + std::istream& in); + /// Loads a serialized `script::Module` from the given `filename`. /// /// The file stored at the location given in `filename` must contain a /// serialized `script::Module`, exported either via `ScriptModule.save()` in /// Python or `torch::jit::ExportModule` in C++. 
+ +TORCH_API std::shared_ptr load(std::istream& in); + TORCH_API std::shared_ptr load(const std::string& filename); } // namespace jit diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 98a7b010419324..ac6f9ac4a15c1c 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -227,7 +227,6 @@ void initJITBindings(PyObject *module) { return createPyObjectForStack(std::move(stack)); }); - py::class_(m, "PyTorchFileWriter") .def(py::init()) .def("write_record", &PyTorchFileWriter::writeRecord) diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index abc0560bbc1464..4c7df820b13b4f 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -371,7 +371,14 @@ void initJitScriptBindings(PyObject* module) { // public. py::class_>(m, "ScriptModule") .def(py::init<>()) - .def("save", &Module::save) + .def("save", [](std::shared_ptr m, const std::string& filename) { + m->save(filename); + }) + .def("save_to_buffer", [](std::shared_ptr m) { + std::ostringstream buf; + m->save(buf); + return py::bytes(buf.str()); + }) .def("_set_optimized", &Module::set_optimized) .def( "_define", @@ -534,7 +541,13 @@ void initJitScriptBindings(PyObject* module) { }); m.def("merge_type_from_type_comment", &mergeTypesFromTypeComment); - m.def("import_ir_module", import_ir_module); + m.def("import_ir_module", [](ModuleLookup module_lookup, const std::string& filename) { + import_ir_module(module_lookup, filename); + }); + m.def("import_ir_module_from_buffer", [](ModuleLookup module_lookup, const std::string& buffer) { + std::istringstream in(buffer); + import_ir_module(module_lookup, in); + }); } } // namespace script diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index 4180c93f48c837..47ac4668ca83b2 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -71,6 +71,10 @@ void Method::ensure_defined() { } } +void Module::save(std::ostream& out) { + ExportModule(*this, out); +} + void Module::save(const std::string& filename) { ExportModule(*this, filename); } diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 50ae9f48fb3c93..2bb4029d83e464 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -20,6 +20,7 @@ #include #include #include +#include // This file contains classes which assist in desugaring Python style // modules and their methods into flattened graphs which don't have any @@ -376,6 +377,8 @@ struct Module { return get_method(method_name)({IValue(std::forward(args))...}); } + void save(std::ostream& out); + void save(const std::string& filename); private: diff --git a/torch/csrc/jit/serialization.h b/torch/csrc/jit/serialization.h index a4ebd864ac6cc3..9fc8d4a1b688dd 100644 --- a/torch/csrc/jit/serialization.h +++ b/torch/csrc/jit/serialization.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include namespace torch { namespace jit { @@ -75,25 +78,16 @@ namespace { static constexpr uint64_t kFileFormatVersion = 0x1L; static constexpr uint8_t kPadValue = 0xEF; - void wrapPErrorAndThrow(const std::string& msg) { - std::ostringstream oss; - oss << msg << " : " << strerror(errno); - throw std::runtime_error(oss.str()); - } } // namespace -class PyTorchFileReader { +class PyTorchStreamReader { public: - PyTorchFileReader(std::string filename) { - fp = std::fopen(filename.c_str(), "rb"); - if (!fp) { - wrapPErrorAndThrow("Couldn't open file for reading!"); - } + PyTorchStreamReader(std::istream& 
in_) : in(in_) { // Store file size so we know when we're done reading because the f* APIs // don't do a good job of that - std::fseek(fp, 0L, SEEK_END); - file_size = std::ftell(fp); - std::fseek(fp, 0L, SEEK_SET); + in.seekg(0L, in.end); + file_size = in.tellg(); + in.seekg(0L); readAndValidateFileHeader(); // Do this now since we're reasonably sure this is actually a PyT file from // the header. @@ -115,7 +109,7 @@ class PyTorchFileReader { } // Seek to the provided offset cursor = key; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); auto tag = read64BitIntegerLittleEndian(); if (tag != RecordTags::STORAGE) { throw std::runtime_error("Attempted to read a record of non-storage type"); @@ -124,18 +118,16 @@ class PyTorchFileReader { seekToNextAlignmentBoundary(); auto ptr = malloc(size); at::DataPtr retval(ptr, ptr, free, at::kCPU); - if (!std::fread(ptr, size, 1, fp)) { - wrapPErrorAndThrow("Failed to read data from record"); - } + + in.read((char*)ptr, size); cursor += size; seekToNextAlignmentBoundary(); return std::tuple(std::move(retval), size); } - ~PyTorchFileReader() { - std::fclose(fp); + ~PyTorchStreamReader() { } private: - FILE *fp; + std::istream& in; size_t cursor = 0; size_t file_size; size_t last_record_offset; @@ -144,8 +136,9 @@ class PyTorchFileReader { uint64_t read64BitIntegerLittleEndian() { uint64_t retval; // TODO endian swap on platforms that need it? - size_t read_bytes = std::fread(&retval, 1u, 8u, fp); - if (read_bytes != 8u) { + in.read(reinterpret_cast(&retval), 8); + std::streamsize read_bytes = in.gcount(); + if (read_bytes != 8) { std::ostringstream errmsg; errmsg << "Expected to read 8 bytes but got " << read_bytes; throw std::runtime_error(errmsg.str()); @@ -158,7 +151,7 @@ class PyTorchFileReader { size_t next_offset = (cursor + kFieldAlignment) - (cursor % kFieldAlignment); size_t pad_amount = next_offset - cursor; cursor += pad_amount; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); } // File format deserialization functions @@ -183,7 +176,7 @@ class PyTorchFileReader { // Seek to location of file footer. We've already validated that the file // length is a multiple of the alignment size cursor = file_size - kFieldAlignment; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); auto tag = read64BitIntegerLittleEndian(); if (tag != RecordTags::FOOTER) { throw std::runtime_error("File footer has wrong record type. Is this" @@ -197,13 +190,9 @@ class PyTorchFileReader { } }; -class PyTorchFileWriter { +class PyTorchStreamWriter { public: - PyTorchFileWriter(const std::string& filename) { - fp = std::fopen(filename.c_str(), "wb"); - if (!fp) { - wrapPErrorAndThrow("Unable to open PyTorch file for writing!"); - } + PyTorchStreamWriter(std::ostream& out_) : out(out_) { writeFileHeader(); // In the case that we do not write any records into this file, the last // record index written into the footer will point to the footer itself. @@ -224,15 +213,14 @@ class PyTorchFileWriter { JIT_ASSERT(!finalized); writeFileFooter(); finalized = true; - std::fclose(fp); } - ~PyTorchFileWriter() { + ~PyTorchStreamWriter() { if (!finalized) { writeEndOfFile(); } } private: - FILE *fp; + std::ostream& out; size_t cursor = 0; bool finalized = false; size_t last_record_idx = 0; @@ -240,17 +228,13 @@ class PyTorchFileWriter { // Utility functions void write64BitIntegerLittleEndian(const uint64_t value) { // TODO endian swap on platforms that need it? 
- if (!std::fwrite(&value, 8u, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(reinterpret_cast(&value), 8); cursor += 8u; } void writePad(const size_t num_bytes) { static std::vector pad_buffer(kPadValue, kFieldAlignment); - if (!std::fwrite(pad_buffer.data(), num_bytes, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(pad_buffer.data(), num_bytes); cursor += num_bytes; } @@ -261,9 +245,7 @@ class PyTorchFileWriter { } void writeBuffer(const char* data, size_t size) { - if (!std::fwrite(data, size, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(data, size); cursor += size; } @@ -281,5 +263,43 @@ class PyTorchFileWriter { } }; +class PyTorchFileReader { + public: + PyTorchFileReader(const std::string& filename) : + in(filename, std::ios_base::binary), + stream_reader(in) {} + + std::tuple getLastRecord() { + return stream_reader.getLastRecord(); + } + + std::tuple getRecordWithKey(uint64_t key) { + return stream_reader.getRecordWithKey(key); + } + + private: + std::ifstream in; + PyTorchStreamReader stream_reader; +}; + +class PyTorchFileWriter { + public: + PyTorchFileWriter(const std::string& filename) : + out(filename, std::ios_base::binary), + stream_writer(out) {} + + uint64_t writeRecord(const char* data, size_t size) { + return stream_writer.writeRecord(data, size); + } + + void writeEndOfFile() { + stream_writer.writeEndOfFile(); + out.close(); + } + + private: + std::ofstream out; + PyTorchStreamWriter stream_writer; +}; }} // namespace torch::jit diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f7cea00e6292fd..21553e4aca9915 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -20,6 +20,8 @@ import numbers import collections import re +if sys.version_info[0] > 2: + import pathlib def _parse_env(name, default, true_message, false_message): @@ -58,19 +60,27 @@ def scope(scope_name): tracing_state.pop_scope() -def load(filename): +def load(f): r""" - Load a ``ScriptModule`` previously saved with :func:`save ` + Load a ``ScriptModule`` previously saved with :func:`save ` .. DANGER:: All previously saved modules, no matter their device, are always loaded onto the CPU. This is different from :func:`torch.load`'s semantics and may change in the future. Arguments: - filename (string): the file to load + f: a file-like object (has to implement read, readline, tell, and seek), + or a string containing a file name Returns: A ``ScriptModule`` object. + + Example: + >>> torch.jit.load('scriptmodule.pt') + # Load ScriptModule from io.BytesIO object + >>> with open('scriptmodule.pt', 'rb') as f: + buffer = io.BytesIO(f.read()) + >>> torch.jit.load(buffer) """ m = ScriptModule() @@ -82,10 +92,48 @@ def module_lookup(names): curr = getattr(curr, name) return curr - torch._C.import_ir_module(module_lookup, filename) + if isinstance(f, str) or \ + (sys.version_info[0] == 2 and isinstance(f, unicode)) or \ + (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): + torch._C.import_ir_module(module_lookup, f) + else: + torch._C.import_ir_module_from_buffer(module_lookup, f.read()) return m +def save(m, f): + """ + Saves a ScriptModule to a file. + + Args: + m: a ScriptModule to save + f: a file-like object (has to implement write and flush) or a string + containing a file name + + .. warning:: + If you are using Python 2, torch.save does NOT support StringIO.StringIO + as a valid file-like object. 
This is because the write method should return + the number of bytes written; StringIO.write() does not do this. + + Please use something like io.BytesIO instead. + + Example: + >>> m = torch.jit.ScriptModule() + >>> # Save to file + >>> torch.jit.save(m, 'scriptmodule.pt') + >>> # Save to io.BytesIO buffer + >>> buffer = io.BytesIO() + >>> torch.jit.save(m, buffer) + """ + if isinstance(f, str) or \ + (sys.version_info[0] == 2 and isinstance(f, unicode)) or \ + (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): + m.save(f) + else: + ret = m.save_to_buffer() + f.write(ret) + + def get_trace_graph(f, args=(), kwargs=None): """ Trace a function or model, returning a tuple consisting of the both the From b0248df72ae09213d9024a74ab40f31be03bdd87 Mon Sep 17 00:00:00 2001 From: cclauss Date: Fri, 28 Sep 2018 08:28:30 -0700 Subject: [PATCH 41/82] =?UTF-8?q?Docs:=20Change=20cuda(async)=20=E2=80=94>?= =?UTF-8?q?=20cuda(non=5Fblocking)=20(#12158)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: goldsborough Modify the docs to match the changes made in #4999 Pull Request resolved: https://github.com/pytorch/pytorch/pull/12158 Differential Revision: D10103964 Pulled By: SsnL fbshipit-source-id: 1b8692da86aca1a52e8d2e6cea76a5ad1f71e058 --- docs/source/notes/cuda.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 63047bb11fddff..212f68e694d7f9 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -74,9 +74,10 @@ You can force synchronous computation by setting environment variable operation is actually executed, so the stack trace does not show where it was requested.) -As an exception, several functions such as :meth:`~torch.Tensor.copy_` admit -an explicit :attr:`async` argument, which lets the caller bypass synchronization -when it is unnecessary. Another exception is CUDA streams, explained below. +As an exception, several functions such as :meth:`~torch.Tensor.to` and +:meth:`~torch.Tensor.copy_` admit an explicit :attr:`non_blocking` argument, +which lets the caller bypass synchronization when it is unnecessary. +Another exception is CUDA streams, explained below. 
CUDA streams ^^^^^^^^^^^^ From 0aff3cc559427c8025fbfd5f9ed28b9336f1657a Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Fri, 28 Sep 2018 09:39:02 -0700 Subject: [PATCH 42/82] Fix broadcasting bug in StudentT (#12148) Summary: This fixes a broadcasting error with the `StudentT` distribution - [x] added a regression test - [x] strengthened parameter broadcasting tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/12148 Differential Revision: D10099226 Pulled By: soumith fbshipit-source-id: 0c5eb14180d158f8fff28ceb9e7cd3471c2bb803 --- test/test_distributions.py | 14 +++++++++++--- torch/distributions/studentT.py | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 69aa6cd73b465e..2c489d858c1238 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -2378,12 +2378,20 @@ def test_valid_parameter_broadcasting(self): (1, 2)), (StudentT(df=torch.tensor([1.]), scale=torch.tensor([[1.]])), (1, 1)), + (StudentT(df=1., loc=torch.zeros(5, 1), scale=torch.ones(3)), + (5, 3)), ] for dist, expected_size in valid_examples: - dist_sample_size = dist.sample().size() - self.assertEqual(dist_sample_size, expected_size, - 'actual size: {} != expected size: {}'.format(dist_sample_size, expected_size)) + actual_size = dist.sample().size() + self.assertEqual(actual_size, expected_size, + '{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size)) + + sample_shape = torch.Size((2,)) + expected_size = sample_shape + expected_size + actual_size = dist.sample(sample_shape).size() + self.assertEqual(actual_size, expected_size, + '{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size)) def test_invalid_parameter_broadcasting(self): # invalid broadcasting cases; should throw error diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index 6530940b328e7f..3e995d1477faed 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -41,8 +41,8 @@ def variance(self): def __init__(self, df, loc=0., scale=1., validate_args=None): self.df, self.loc, self.scale = broadcast_all(df, loc, scale) - self._chi2 = Chi2(df) - batch_shape = torch.Size() if isinstance(df, Number) else self.df.size() + self._chi2 = Chi2(self.df) + batch_shape = self.df.size() super(StudentT, self).__init__(batch_shape, validate_args=validate_args) def expand(self, batch_shape, _instance=None): From 65bf181ddf6ea27f77d09d98c67996c991848c42 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Fri, 28 Sep 2018 09:40:58 -0700 Subject: [PATCH 43/82] Add "ai.onnx.pytorch" onnx domain (#12157) Summary: zrphercule Pull Request resolved: https://github.com/pytorch/pytorch/pull/12157 Differential Revision: D10100799 Pulled By: bddppq fbshipit-source-id: 76fdd126e0b52c54276752b3b0174735355a7d2f --- caffe2/onnx/torch_ops/CMakeLists.txt | 5 +++++ caffe2/onnx/torch_ops/constants.h | 7 +++++++ caffe2/onnx/torch_ops/defs.cc | 24 ++++++++++++++++++++++++ caffe2/onnx/torch_ops/operator_sets.h | 22 ++++++++++++++++++++++ caffe2/onnx/torch_ops/schema.cc | 17 +++++++++++++++++ caffe2/onnx/torch_ops/schema.h | 8 ++++++++ cmake/Dependencies.cmake | 3 +++ 7 files changed, 86 insertions(+) create mode 100644 caffe2/onnx/torch_ops/CMakeLists.txt create mode 100644 caffe2/onnx/torch_ops/constants.h create mode 100644 caffe2/onnx/torch_ops/defs.cc create mode 100644 caffe2/onnx/torch_ops/operator_sets.h create mode 100644 
caffe2/onnx/torch_ops/schema.cc create mode 100644 caffe2/onnx/torch_ops/schema.h diff --git a/caffe2/onnx/torch_ops/CMakeLists.txt b/caffe2/onnx/torch_ops/CMakeLists.txt new file mode 100644 index 00000000000000..99443af4cc9bc6 --- /dev/null +++ b/caffe2/onnx/torch_ops/CMakeLists.txt @@ -0,0 +1,5 @@ +# ---[ Extra onnx files. +file(GLOB ONNX_SRCS *.cc) + +# ---[ Send the lists to the parent scope. +set(ONNX_SRCS ${ONNX_SRCS} PARENT_SCOPE) diff --git a/caffe2/onnx/torch_ops/constants.h b/caffe2/onnx/torch_ops/constants.h new file mode 100644 index 00000000000000..ebd2a2464d9b33 --- /dev/null +++ b/caffe2/onnx/torch_ops/constants.h @@ -0,0 +1,7 @@ +namespace ONNX_NAMESPACE { + +const int AI_ONNX_PYTORCH_DOMAIN_MIN_OPSET = 1; +const int AI_ONNX_PYTORCH_DOMAIN_MAX_OPSET = 1; +constexpr const char* AI_ONNX_PYTORCH_DOMAIN = "ai.onnx.pytorch"; + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/defs.cc b/caffe2/onnx/torch_ops/defs.cc new file mode 100644 index 00000000000000..8d03120af03557 --- /dev/null +++ b/caffe2/onnx/torch_ops/defs.cc @@ -0,0 +1,24 @@ +// Copyright (c) Facebook Inc. and Microsoft Corporation. +// Licensed under the MIT license. + +#include "./schema.h" + +namespace ONNX_NAMESPACE { + +static const char* dummy_test_only_ver1_doc = R"DOC( +A dummy op for verifying the build setup works, don't use me. +)DOC"; + +ONNX_PYTORCH_OPERATOR_SET_SCHEMA( + DUMMY_TEST_ONLY, + 1, + OpSchema() + .SetDoc(dummy_test_only_ver1_doc) + .Input(0, "input", "Input tensor", "T") + .Output(0, "output", "Output tensor", "T") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.")); + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/operator_sets.h b/caffe2/onnx/torch_ops/operator_sets.h new file mode 100644 index 00000000000000..760a6b7fa2a7b6 --- /dev/null +++ b/caffe2/onnx/torch_ops/operator_sets.h @@ -0,0 +1,22 @@ +#pragma once + +#include "onnx/defs/schema.h" + +namespace ONNX_NAMESPACE { + +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(PyTorch, 1, DUMMY_TEST_ONLY); + +// Iterate over schema from ai.onnx.pytorch domain opset 1 +class OpSet_PyTorch_ver1 { + public: + static void ForEachSchema(std::function fn) { + fn(GetOpSchema()); + } +}; + +inline void RegisterPyTorchOperatorSetSchema() { + RegisterOpSetSchema(); +} + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/schema.cc b/caffe2/onnx/torch_ops/schema.cc new file mode 100644 index 00000000000000..de933c2c23ab2e --- /dev/null +++ b/caffe2/onnx/torch_ops/schema.cc @@ -0,0 +1,17 @@ +#include "./schema.h" +#include "./operator_sets.h" + +namespace { +using namespace ONNX_NAMESPACE; +class PyTorchSchemasRegisterer { + public: + PyTorchSchemasRegisterer() { + OpSchemaRegistry::DomainToVersionRange::Instance().AddDomainToVersion( + AI_ONNX_PYTORCH_DOMAIN, + AI_ONNX_PYTORCH_DOMAIN_MIN_OPSET, + AI_ONNX_PYTORCH_DOMAIN_MAX_OPSET); + RegisterPyTorchOperatorSetSchema(); + } +}; +static PyTorchSchemasRegisterer registerer{}; +} // namespace diff --git a/caffe2/onnx/torch_ops/schema.h b/caffe2/onnx/torch_ops/schema.h new file mode 100644 index 00000000000000..3454e366a1eeba --- /dev/null +++ b/caffe2/onnx/torch_ops/schema.h @@ -0,0 +1,8 @@ +#pragma once + +#include "./constants.h" +#include "onnx/defs/schema.h" + +#define ONNX_PYTORCH_OPERATOR_SET_SCHEMA(name, ver, impl) \ + ONNX_OPERATOR_SET_SCHEMA_EX( \ + name, PyTorch, AI_ONNX_PYTORCH_DOMAIN, ver, false, impl) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake 
index 82aff7b8cc87d5..d9ddfcdfc4f9ee 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -797,7 +797,10 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
     if (CAFFE2_LINK_LOCAL_PROTOBUF)
       set(ONNX_PROTO_POST_BUILD_SCRIPT ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake)
     endif()
+    # Add op schemas in "ai.onnx.pytorch" domain
+    add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../caffe2/onnx/torch_ops")
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/onnx)
+    include_directories(${ONNX_INCLUDE_DIRS})
     add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE})

     # In mobile build we care about code size, and so we need drop

From e7e10e60e09c8eb133b1addab1490b6320a7c366 Mon Sep 17 00:00:00 2001
From: Zachary DeVito
Date: Fri, 28 Sep 2018 10:50:46 -0700
Subject: [PATCH 44/82] Introduce builtin script functions (#12141)

Summary:
This functionality replaces the Scalar-Tensor builtin operators with builtin functions. Builtin functions are used in place of operators where one operator can be defined using a composition of another. This simplifies later optimization passes by allowing us to have fewer operators. In the future, builtin functions can be used for other purposes. For example, we can define derivative functions as code rather than building graphs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/12141

Reviewed By: ezyang

Differential Revision: D10088065

Pulled By: zdevito

fbshipit-source-id: a2acb06346e649c4c8a2fe423b420871161c21cf
---
 .../TestJit.test_constant_prop_print.expect   |  5 +-
 .../TestJit.test_constant_prop_simple.expect  |  5 +-
 .../TestScript.test_scalar_fusion.expect      |  2 +-
 torch/CMakeLists.txt                          |  1 +
 torch/csrc/jit/autodiff.cpp                   | 16 ----
 torch/csrc/jit/passes/graph_fuser.cpp         | 12 +--
 torch/csrc/jit/passes/shape_analysis.cpp      |  9 +-
 torch/csrc/jit/register_prim_ops.cpp          | 36 --------
 torch/csrc/jit/script/builtin_functions.cpp   | 83 +++++++++++++++++++
 torch/csrc/jit/script/builtin_functions.h     | 13 +++
 torch/csrc/jit/script/compiler.cpp            | 16 ++++
 torch/csrc/jit/script/module.cpp              | 42 ++++++++--
 torch/csrc/jit/script/module.h                | 15 ++++
 13 files changed, 171 insertions(+), 84 deletions(-)
 create mode 100644 torch/csrc/jit/script/builtin_functions.cpp
 create mode 100644 torch/csrc/jit/script/builtin_functions.h

diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect
index 5bc86daf4765c7..6f72acc4c8483e 100644
--- a/test/expect/TestJit.test_constant_prop_print.expect
+++ b/test/expect/TestJit.test_constant_prop_print.expect
@@ -2,6 +2,7 @@ graph(%input_tensor : Dynamic) {
   %1 : int = prim::Constant[value=6]()
    = prim::Print(%1)
   %2 : int = prim::Constant[value=8]()
-  %3 : Dynamic = aten::add(%2, %input_tensor)
-  return (%3);
+  %3 : int = prim::Constant[value=1]()
+  %4 : Dynamic = aten::add(%input_tensor, %2, %3)
+  return (%4);
 }
diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect
index 029f9ac05a0783..71cf099a54a663 100644
--- a/test/expect/TestJit.test_constant_prop_simple.expect
+++ b/test/expect/TestJit.test_constant_prop_simple.expect
@@ -1,5 +1,6 @@ graph(%input_tensor : Dynamic) {
   %1 : int = prim::Constant[value=8]()
-  %2 : Dynamic = aten::add(%1, %input_tensor)
-  return (%2);
+  %2 : int = prim::Constant[value=1]()
+  %3 : Dynamic = aten::add(%input_tensor, %1, %2)
+  return (%3);
 }
diff --git a/test/expect/TestScript.test_scalar_fusion.expect b/test/expect/TestScript.test_scalar_fusion.expect
index 
e2fd92a0f5739c..565855f262d16c 100644 --- a/test/expect/TestScript.test_scalar_fusion.expect +++ b/test/expect/TestScript.test_scalar_fusion.expect @@ -6,7 +6,7 @@ graph(%x : Float() with prim::FusionGroup_0 = graph(%0 : Float() %1 : Float()) { %2 : int = prim::Constant[value=2]() - %3 : Float() = aten::mul(%2, %1) + %3 : Float() = aten::mul(%1, %2) %4 : int = prim::Constant[value=1]() %5 : Float() = aten::add(%3, %0, %4) return (%5); diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index e9b275bc7d2cab..27e580e8965edf 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -175,6 +175,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 80e196c1b8ac59..a59c856eaba751 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -27,14 +27,10 @@ bool isDifferentiable(Node * n) { static OperatorSet differentiable_ops = { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", - "aten::add(Scalar other, Tensor self) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", - "aten::sub(Scalar other, Tensor self) -> Tensor", "aten::mul(Tensor self, Tensor other) -> Tensor", "aten::mul(Tensor self, Scalar other) -> Tensor", - "aten::mul(Scalar other, Tensor self) -> Tensor", - "aten::div(Scalar other, Tensor self) -> Tensor", "aten::div(Tensor self, Tensor other) -> Tensor", "aten::div(Tensor self, Scalar other) -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", @@ -132,9 +128,6 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { return {grads.at(0), nullptr, nullptr}; - } else if (node->matches("aten::add(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, grads.at(0)}; - } else if (node->kind() == prim::AutogradAdd) { return {grads.at(0), grads.at(0)}; @@ -144,27 +137,18 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { return {grads.at(0), nullptr, nullptr}; - } else if (node->matches("aten::sub(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, -grads.at(0)}; - } else if (node->matches("aten::mul(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor")) { return {grads.at(0) * inputs.at(1), nullptr}; - } else if (node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, grads.at(0) * inputs.at(0)}; - } else if (node->matches("aten::div(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) / inputs.at(1), -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; } else if (node->matches("aten::div(Tensor self, Scalar other) -> Tensor")) { return {grads.at(0) / inputs.at(1), nullptr}; - } else if (node->matches("aten::div(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; - 
} else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) { // TODO: The order of operations matter in this case. This // works for ppc64le and x86_64. Need to look at why the diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 6c7166f3b43552..c8a1ef566f2a3c 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -208,16 +208,12 @@ struct GraphFuser { /*const=*/attr::alpha) || node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*const=*/{attr::other, attr::alpha}) || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", /*const=*/attr::alpha) || node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*const=*/{attr::other, attr::alpha}) || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::mul(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::div(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::div(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", /*const=*/{attr::min, attr::max})) { auto inputs = tensorInputs(node); return haveSupportedType(inputs); @@ -225,22 +221,16 @@ struct GraphFuser { else if ( node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || node->matches("aten::lt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::lt(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || node->matches("aten::le(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::le(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || node->matches("aten::gt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::gt(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || node->matches("aten::ge(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::ge(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || node->matches("aten::eq(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::eq(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ne(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ne(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::ne(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other)) { + node->matches("aten::ne(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other)) { // comparison operators produce Byte type, and it's ok, check only inputs auto inputs = tensorInputs(node); return haveSupportedType(inputs); diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index b01d9d3b61359c..eedc7fd0a8a686 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ 
-557,10 +557,6 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { "aten::pow(Tensor self, Scalar exponent) -> Tensor", "aten::fmod(Tensor self, Scalar other) -> Tensor", "aten::remainder(Tensor self, Scalar other) -> Tensor", - "aten::add(Scalar other, Tensor self) -> Tensor", - "aten::sub(Scalar other, Tensor self) -> Tensor", - "aten::mul(Scalar other, Tensor self) -> Tensor", - "aten::div(Scalar other, Tensor self) -> Tensor", "aten::pow(Scalar base, Tensor self) -> Tensor", "aten::__and__(Tensor self, Scalar other) -> Tensor", "aten::__or__(Tensor self, Scalar other) -> Tensor", @@ -1139,10 +1135,7 @@ bool PropagateCompleteShapeOnNode(Node * node, bool insert_expands, } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor") || node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor") || node->matches("aten::mul(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor") || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { + node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor")) { node->output()->setType(tensor_types.at(0)); return true; } else if (insert_expands && ( diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index be57a03c9ff48e..cdea4ab894b253 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -480,26 +480,6 @@ RegisterOperators reg({ #define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, float) #define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, int) -// define helpers for where aten is missing scalar overloads -// note: it would be better to define these in a standard library as -// script functions and have the compiler substitute them in -// however, we need to add type annotations to the parser in order for us -// to move them there. -// e.g. s + t ==> t + s -// e.g. 
s - d == -d + s - -#define DEFINE_ST_OP(aten_op, reverse_exp) \ - Operator("aten::" #aten_op "(Scalar other, Tensor self) -> Tensor", [](Node* node) { \ - return [=](Stack& stack) { \ - at::Scalar a; \ - at::Tensor b; \ - pop(stack, a, b); \ - at::DeviceGuard guard(b); \ - push(stack, reverse_exp); \ - return 0; \ - }; \ - }), - // Convert an python index (which may be negative) into an index usable for a // C++ container int64_t normalizeIndex(int64_t idx, int64_t list_size) { @@ -793,21 +773,5 @@ RegisterOperators reg2({ return 0; }; }), - // commutative - DEFINE_ST_OP(mul, at::mul(b, a)) - DEFINE_ST_OP(add, at::add(b, a)) - DEFINE_ST_OP(ne, at::ne(b, a)) - DEFINE_ST_OP(eq, at::eq(b, a)) - - // comparisons, reverse the condition - DEFINE_ST_OP(lt, b > a) - DEFINE_ST_OP(le, b >= a) - DEFINE_ST_OP(gt, b < a) - DEFINE_ST_OP(ge, b <= a) - - // rsub - DEFINE_ST_OP(sub, at::add(b.neg(), a)) - // rdiv - DEFINE_ST_OP(div, at::mul(at::reciprocal(b), a)) }); }}} // torch::jit::anon diff --git a/torch/csrc/jit/script/builtin_functions.cpp b/torch/csrc/jit/script/builtin_functions.cpp new file mode 100644 index 00000000000000..ea82d06879d7c7 --- /dev/null +++ b/torch/csrc/jit/script/builtin_functions.cpp @@ -0,0 +1,83 @@ +#include "torch/csrc/jit/script/builtin_functions.h" +#include "torch/csrc/api/include/torch/jit.h" +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace script { + +auto scalar_operators_source = CodeTemplate( +R"SCRIPT( +def mul(a : ${Scalar}, b : Tensor) -> Tensor: + return b * a +def add(a : ${Scalar}, b : Tensor) -> Tensor: + return b + a +def ne(a : ${Scalar}, b : Tensor) -> Tensor: + return b != a +def eq(a : ${Scalar}, b : Tensor) -> Tensor: + return b == a +def lt(a : ${Scalar}, b : Tensor) -> Tensor: + return b > a +def le(a : ${Scalar}, b : Tensor) -> Tensor: + return b >= a +def gt(a : ${Scalar}, b : Tensor) -> Tensor: + return b < a +def ge(a : ${Scalar}, b : Tensor) -> Tensor: + return b <= a +def sub(a : ${Scalar}, b : Tensor) -> Tensor: + return torch.neg(b) + a +def div(a : ${Scalar}, b : Tensor) -> Tensor: + return torch.reciprocal(b) * a +)SCRIPT"); + +struct BuiltinFunctionRegistry { + + const std::vector& getAllBuiltinFunctionsFor(Symbol name) { + const static std::vector empty; + // when initializing the builtin function library, we will re-enter + // getAllBuiltinFunctionsFor since it is called in the compiler to + // lookup builtins and initializing the builtin functions calls the compiler. + // To avoid deadlocking, we use a recursive mutex (same thread can re-lock, + // the mutex without waiting), and report no loaded builtins during init. 
+ std::lock_guard guard(mutex); + if(state == INTIIALIZING) { + return empty; + } else if (state == UNINITIALIZED) { + state = INTIIALIZING; + loadBuiltinFunctions(); + state = INITIALIZED; + } + JIT_ASSERT(state == INITIALIZED); + auto it = builtins_by_name.find(name); + if(it == builtins_by_name.end()) + return empty; + return it->second; + } +private: + void loadSource(const std::string& source) { + auto module = std::make_shared(); + defineMethodsInModule( + *module, source, script::nativeResolver, /*self=*/nullptr); + modules.push_back(module); + for (auto& method : module->get_methods()) { + builtins_by_name[Symbol::fromQualString("aten::" + method.key)].push_back( + method.value.get()); + } + } + void loadBuiltinFunctions() { + for(auto scalar : {"float", "int"}) { + TemplateEnv env; + env.s("Scalar", scalar); + loadSource(scalar_operators_source.format(env)); + } + } + enum {UNINITIALIZED, INTIIALIZING, INITIALIZED} state = UNINITIALIZED; + std::recursive_mutex mutex; + std::vector> modules; + std::unordered_map> builtins_by_name; +}; + +TORCH_API const std::vector& getAllBuiltinFunctionsFor(Symbol name) { + static BuiltinFunctionRegistry registry; + return registry.getAllBuiltinFunctionsFor(name); +} + +}}} diff --git a/torch/csrc/jit/script/builtin_functions.h b/torch/csrc/jit/script/builtin_functions.h new file mode 100644 index 00000000000000..042dc96b1826f0 --- /dev/null +++ b/torch/csrc/jit/script/builtin_functions.h @@ -0,0 +1,13 @@ +#pragma once + +#include "torch/csrc/WindowsTorchApiMacro.h" +#include "torch/csrc/jit/script/module.h" + +namespace torch { namespace jit { namespace script { + + +TORCH_API const std::vector& getAllBuiltinFunctionsFor(Symbol name); + + + +}}} diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 3b5c523ce439c2..28aa735fc37249 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -8,6 +8,7 @@ #include "torch/csrc/jit/assertions.h" #include "torch/csrc/utils/object_ptr.h" #include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/script/builtin_functions.h" #include "torch/csrc/jit/constants.h" @@ -721,6 +722,8 @@ Value* emitBuiltinCall( const auto& variants = getAllOperatorsFor(name); + const auto& builtin_functions = getAllBuiltinFunctionsFor(name); + std::stringstream failure_messages; //first we try to match the schema without any conversion //if no schema matches then insert ImplicitTensorToNum @@ -741,6 +744,19 @@ Value* emitBuiltinCall( return emitBuiltinNode(*matched_schema, loc, graph, name); } } + for (Method* method : builtin_functions) { + if (auto result = try_emit_call_to( + graph, + loc, + *method, + inputs, + attributes, + failure_messages, + nullptr, + convert_tensors_to_nums)) { + return packOutputs(graph, *result); + } + } } // none of the options worked diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index 47ac4668ca83b2..61261a352d456e 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -37,8 +37,15 @@ const FunctionSchema& Method::getSchema() const { return *schema; } -std::vector Method::emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs) { - JIT_ASSERT(!executor); +at::optional> try_emit_call_to( + Graph& graph, + SourceRange loc, + Method& callee, + ArrayRef args, + ArrayRef kwargs, + std::stringstream& failure_messages, + Method* caller, + bool conv_tensors_to_nums) { try { callee.ensure_defined(); } catch (RecursiveMethodCallError&) { @@ -47,19 
+54,38 @@ std::vector Method::emit_call_to(SourceRange loc, Method & callee, Array } auto fn = callee.graph(); - std::stringstream failure_messages; auto matched_schema = tryMatchSchema( callee.getSchema(), - loc, *graph(), args, kwargs, failure_messages, /*conv_tensors_to_nums*/true); + loc, graph, args, kwargs, failure_messages, conv_tensors_to_nums); if(!matched_schema) - throw ErrorReport(loc) << failure_messages.str(); + return at::nullopt; // parameters to callee method (which become parameters to _this_ method // if they were not already) - for(at::Tensor* member : callee.member_inputs) { - matched_schema->inputs.push_back(get_or_add_parameter(member)); + for(at::Tensor* member : callee.params()) { + if(!caller) { + throw ErrorReport(loc) << " attempting to call a method with parameters from a raw graph. File a bug report"; + } + matched_schema->inputs.push_back(caller->get_or_add_parameter(member)); + } + return inlineCallTo(graph, *callee.graph(), matched_schema->inputs); +} + +std::vector Method::emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs) { + JIT_ASSERT(!executor); + std::stringstream failure_messages; + if (auto result = try_emit_call_to( + *graph(), + loc, + callee, + args, + kwargs, + failure_messages, + this, + /*conv_tensors_to_nums=*/true)) { + return *result; } - return inlineCallTo(*graph(), *callee.graph(), matched_schema->inputs); + throw ErrorReport(loc) << failure_messages.str(); } void Method::ensure_defined() { diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 2bb4029d83e464..cafd084ce2cca3 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -85,6 +85,7 @@ struct Method { // defined here to keep details of member_input handling confined to this class std::vector emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs); + // if this isn't yet defined, run its method_creator function void ensure_defined(); @@ -393,4 +394,18 @@ struct Module { bool optimize; }; +// returns at::nullopt and fills in failure_messages if the callee does not +// match the functions schema +at::optional> try_emit_call_to( + Graph& graph, + SourceRange loc, + Method& callee, + ArrayRef args, + ArrayRef kwargs, + std::stringstream& failure_messages, + // when callee uses no parameters (e.g. it is a function in a compilation unit, + // and not a method), then nullptr can be passed as caller. + Method* caller, + bool conv_tensors_to_nums); + }}} From 8009b6cdb54602053ca68e6a5abc6bd80e35530b Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Fri, 28 Sep 2018 11:00:42 -0700 Subject: [PATCH 45/82] Kill self_ty in TYPE_DERIVED_DEFINITION_NATIVE (#11903) Summary: This allows us to call the type argument with name other than `self_ty`. ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11903 Differential Revision: D10105029 Pulled By: SsnL fbshipit-source-id: 0fbdc728123ebc1154d080628cb41a085ba3e6d7 --- aten/src/ATen/function_wrapper.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 189cadf0b6d1c6..1955d07b630d74 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -107,16 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. 
-#
-# TODO: self_ty is a hack to make things work for native methods which need to
-# take a dtype, but also need to dispatch differently for different types.
-# Eliminate it at some point.
 TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\
 ${return_type} ${Type}::${api_name}(${type_method_formals}) const {
     ${device_guard_declaration}
-    const auto& self_ty = *this;
-    (void)self_ty;
-    ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals});
+    ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${type_derived_call_actuals});
 }
 """)
 TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\
@@ -1574,8 +1568,15 @@ def process_native(option):
                     TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env))
             else:
                 option['native_type_method_dispatch'] = native_dispatch
+                type_derived_call_actuals = []
+                for actual, arg in zip(option['actuals'], option['arguments']):
+                    if arg.get('is_type_dispatched', False):
+                        type_derived_call_actuals.append('*this')
+                    else:
+                        type_derived_call_actuals.append(actual)
                 type_object_definitions.append(
-                    TYPE_DERIVED_DEFINITION_NATIVE.substitute(env))
+                    TYPE_DERIVED_DEFINITION_NATIVE.substitute(
+                        env, type_derived_call_actuals=type_derived_call_actuals))
 
 for declaration in declarations:
     for option in declaration['options']:

From ab9a5976a025b3a56cfc64b86981ad52f8fe4137 Mon Sep 17 00:00:00 2001
From: Aditya Kumar
Date: Fri, 28 Sep 2018 11:20:40 -0700
Subject: [PATCH 46/82] Disable inlining of EnforceFailMessage (#12078)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12078

The constructor is inlined multiple times.

Reviewed By: salexspb

Differential Revision: D9358084

fbshipit-source-id: c8d4177a3fcccac574ee4f63336a6fa8bfb07d11
---
 caffe2/core/logging.cc | 5 +++++
 caffe2/core/logging.h  | 5 ++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc
index beaee7b15f9ff0..30603888ad1c44 100644
--- a/caffe2/core/logging.cc
+++ b/caffe2/core/logging.cc
@@ -13,6 +13,11 @@ CAFFE2_DEFINE_bool(caffe2_use_fatal_for_enforce, false,
     "of throwing an exception.");
 
 namespace caffe2 {
+namespace enforce_detail {
+/* implicit */ EnforceFailMessage::EnforceFailMessage(std::string&& msg) {
+  msg_ = new std::string(std::move(msg));
+}
+} // namespace enforce_detail
 
 size_t ReplaceAll(string& s, const char* from, const char* to) {
   CAFFE_ENFORCE(from && *from);
diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h
index 288c34afd5dbe7..859ee4765683a1 100644
--- a/caffe2/core/logging.h
+++ b/caffe2/core/logging.h
@@ -187,9 +187,8 @@ class CAFFE2_API EnforceFailMessage {
         "like `Equals`. Use CAFFE_ENFORCE for simple boolean checks.");
   }
 
-  /* implicit */ EnforceFailMessage(std::string&& msg) {
-    msg_ = new std::string(std::move(msg));
-  }
+  /* implicit */ EnforceFailMessage(std::string&& msg);
+
   inline bool bad() const {
     return msg_ != nullptr;
   }

From 0e779c27e1f07dcb1b8fa75cc14b0e220e9ddd3e Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Fri, 28 Sep 2018 13:52:19 -0700
Subject: [PATCH 47/82] Deduplicate canonical_axis_index_ with maybe_wrap_dim (#11891)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11891

maybe_wrap_dim is a slightly more general function, which is able to,
under some circumstances, treat 0 as a "valid" dimension even when a
tensor is scalar. canonical_axis_index_ never accepts this behavior, so
it always passes false.
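To make the difference concrete, here is a minimal self-contained sketch of
the semantics (not the verbatim WrapDimMinimal.h source, which uses the
usual assertion macros rather than exceptions):

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>

// Sketch of maybe_wrap_dim: wraps a negative dim (-1 means "last").
// With wrap_scalar=true, a 0-dim (scalar) tensor is treated as if it
// were 1-dimensional, so dims 0 and -1 stay valid.
int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar = true) {
  if (dim_post_expr <= 0) {
    if (!wrap_scalar) throw std::runtime_error("dimension specified as non-existing");
    dim_post_expr = 1;  // this is what makes 0 and -1 acceptable for scalars
  }
  if (dim < -dim_post_expr || dim >= dim_post_expr) {
    throw std::runtime_error("dimension out of range");
  }
  return dim < 0 ? dim + dim_post_expr : dim;
}

// The Caffe2 spelling now just forwards, always with the strict behavior.
int canonical_axis_index_(int axis_index, int ndims) {
  return maybe_wrap_dim(axis_index, ndims, /*wrap_scalar=*/false);
}

int main() {
  std::cout << canonical_axis_index_(-1, 4) << "\n";  // prints 3
  std::cout << maybe_wrap_dim(0, 0) << "\n";          // scalar case: prints 0
  // canonical_axis_index_(0, 0) would throw: wrap_scalar is always false.
}
```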
Reviewed By: jerryzh168 Differential Revision: D9968320 fbshipit-source-id: 13c98fff0880d7bfcd00911a76c8aa10d37bd183 --- aten/src/ATen/core/TensorImpl.h | 11 +---------- aten/src/ATen/core/WrapDimMinimal.h | 6 ++++++ caffe2/core/tensor_impl.h | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index d16e5a83cb6683..f685aa9d93e377 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -10,6 +10,7 @@ #include "ATen/core/LegacyTypeDispatch.h" #include "ATen/core/Backend.h" #include "ATen/core/context_base.h" +#include "ATen/core/WrapDimMinimal.h" #include "caffe2/core/allocator.h" #include "caffe2/core/common.h" @@ -89,16 +90,6 @@ inline int64_t size_between_dim_(int k, int l, IntList dims) { return r; } -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - /** * The low-level representation of a tensor, which contains a storage * (which contains the actual data) and metadata (e.g., sizes and strides) diff --git a/aten/src/ATen/core/WrapDimMinimal.h b/aten/src/ATen/core/WrapDimMinimal.h index 6971bac0b3f67c..859c1da0590a9d 100644 --- a/aten/src/ATen/core/WrapDimMinimal.h +++ b/aten/src/ATen/core/WrapDimMinimal.h @@ -20,4 +20,10 @@ static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wr return dim; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +// This is the "Caffe2" name +static inline int canonical_axis_index_(int axis_index, int ndims) { + return maybe_wrap_dim(axis_index, ndims, false); +} + } diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 17d3b22083bbb1..2ee51f655e1e22 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace caffe2 { using at::ToVectorint64_t; From 7b2c0a09e4795bb8dc06c3d1881289fbe75d84e2 Mon Sep 17 00:00:00 2001 From: mruberry Date: Fri, 28 Sep 2018 13:55:42 -0700 Subject: [PATCH 48/82] Adds support for NaN, +inf, -inf float scalars to CPU and CUDA fusers (#12070) Summary: In current upstream float scalars are always written into kernels with: `out << std::scientific << v << "f";` When the floats are special values like NaN, +inf, or -inf this produces nonsense that causes compilation to fail. This fix updates the conversion of float scalars to device-specific special values. The appropriate macros are added to the CPU and CUDA resource strings. Note that a NAN macro was not necessary on the CPU since math.h defines NAN. To verify this fix I updated the test_clamp_fusion test in test_jit.py. I wanted to test -inf, too, but -inf is not currently accepted by the interpreter. Edit: Forgot to mention, this partially addresses issue #12067. 
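For reference, this standalone snippet (not part of the patch) reproduces the
old failure mode for +inf; on common platforms the stream inserter prints
"inf", so the token the fuser emitted was not a valid literal:

```cpp
#include <cmath>
#include <iostream>
#include <sstream>

int main() {
  // The old scalarValue() did exactly this for every double scalar:
  std::ostringstream out;
  out << std::scientific << INFINITY << "f";
  std::cout << out.str() << "\n";  // typically prints "inff"
  // "inff" (or "nanf" for NaN) is nonsense in the generated kernel
  // source, so compilation of the fused kernel fails.
}
```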
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12070 Reviewed By: ezyang Differential Revision: D10044704 Pulled By: soumith fbshipit-source-id: 8f4a930862d66a7d37d985e3f6a6fb724579e74c --- test/test_jit.py | 17 +++++++++++------ torch/csrc/jit/fusers/common/fused_kernel.cpp | 16 +++++++++++++++- torch/csrc/jit/fusers/cpu/resource_strings.h | 4 ++++ torch/csrc/jit/fusers/cuda/resource_strings.h | 4 ++++ 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 22e7a5f69b467c..0757f5fb1f7640 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2724,18 +2724,23 @@ def func(a, b): @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @skipIfRocm def test_clamp_fusion(self): - def func(a, b): + def func2(a, b): return torch.clamp(a + b, min=0, max=2) + def funcInf(a, b): + return torch.clamp(a + b, min=0, max=float('inf')) + a = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True) b = torch.randn(4, 4, dtype=torch.float, device='cuda') - s = self.checkScript(func, (a, b)) - self.assertAllFused(s.graph_for(a, b)) + funcs = (func2, funcInf) + for f in funcs: + s = self.checkScript(f, (a, b)) + self.assertAllFused(s.graph_for(a, b)) - c = s(a, b) - c.sum().backward() - self.assertAllFused(backward_graph(s)) + c = s(a, b) + c.sum().backward() + self.assertAllFused(backward_graph(s)) def test_mul(self): def func(a, b): diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp index 6095bb13748470..5718a656a7f520 100644 --- a/torch/csrc/jit/fusers/common/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -235,9 +236,22 @@ static std::string scalarValue(int64_t v) { return std::to_string(v); } +// Note: The NAN, NEG_INFINITY and POS_INFINITY strings map to device-specific +// implementations of these special values. These macros are found in the +// resource strings for each device. 
static std::string scalarValue(double v) { std::ostringstream out; - out << std::scientific << v << "f"; + if (std::isnan(v)) { + out << "NAN"; + } else if (std::isinf(v)) { + if (v < 0) { + out << "NEG_INFINITY"; + } else { + out << "POS_INFINITY"; + } + } else { + out << std::scientific << v << "f"; + } return out.str(); } diff --git a/torch/csrc/jit/fusers/cpu/resource_strings.h b/torch/csrc/jit/fusers/cpu/resource_strings.h index 60c1c0faaa4fea..59a92ccc19b740 100644 --- a/torch/csrc/jit/fusers/cpu/resource_strings.h +++ b/torch/csrc/jit/fusers/cpu/resource_strings.h @@ -11,6 +11,10 @@ Correct code for this case is generated, however, nvrtc does not know how to han so typedefs help it handle those cases*/ auto type_declarations_template = CodeTemplate(R"( + +#define POS_INFINITY INFINITY +#define NEG_INFINITY -INFINITY + typedef ${IndexType} IndexType; template struct TensorInfo { diff --git a/torch/csrc/jit/fusers/cuda/resource_strings.h b/torch/csrc/jit/fusers/cuda/resource_strings.h index 0063288721d727..6278a4f239636c 100644 --- a/torch/csrc/jit/fusers/cuda/resource_strings.h +++ b/torch/csrc/jit/fusers/cuda/resource_strings.h @@ -18,6 +18,10 @@ typedef long long int int64_t; ${HalfHeader} ${RandHeader} +#define NAN __int_as_float(0x7fffffff) +#define POS_INFINITY __int_as_float(0x7f800000) +#define NEG_INFINITY __int_as_float(0xff800000) + typedef ${IndexType} IndexType; template struct TensorInfo { From 60061a20d95751c11d9d1083defa097d74a1894f Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Fri, 28 Sep 2018 14:06:08 -0700 Subject: [PATCH 49/82] Adding Declare and Export operators (#11954) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11954 Adding an alternative to external_input and external_output for use in some distributed settings Reviewed By: aazzolini Differential Revision: D9997121 fbshipit-source-id: 1b5cc03fd3051368a3edc69e7bc472386f5746b5 --- caffe2/opt/converter.cc | 44 ++++++++++++++++++++++++++ caffe2/opt/converter.h | 3 ++ caffe2/opt/converter_nomigraph_test.cc | 34 ++++++++++++++++++++ 3 files changed, 81 insertions(+) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 6a7520716032bc..46fd8349b05832 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -519,4 +519,48 @@ caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m, const caffe2::NetDef& old return predictNet; } +void pushOpToFront(caffe2::OperatorDef& op, caffe2::NetDef* net) { + *net->add_op() = op; + google::protobuf::RepeatedPtrField* op_list( + net->mutable_op()); + // Reverse iterate, swapping new element in front each time + for (int i(net->op_size() - 1); i > 0; --i) { + op_list->SwapElements(i, i - 1); + } +} + +void injectDataEdgeIndicators(caffe2::NetDef* net) { + for (const auto& input : net->external_input()) { + caffe2::OperatorDef op; + op.set_type("Declare"); + op.add_output(input); + pushOpToFront(op, net); + } + for (const auto& output : net->external_output()) { + caffe2::OperatorDef op; + op.set_type("Export"); + op.add_input(output); + *net->add_op() = op; + } + net->clear_external_input(); + net->clear_external_output(); +} + +void removeDataEdgeIndicators(caffe2::NetDef* net) { + google::protobuf::RepeatedPtrField* op_list( + net->mutable_op()); + for (auto i = 0; i < net->op_size(); ++i) { + auto op = net->op(i); + if (op.type() == "Declare") { + net->add_external_input(op.output(0)); + } else if (op.type() == "Export") { + net->add_external_output(op.input(0)); + } else { + continue; + } + // Note that this 
compensates for modifying the list inplace + op_list->DeleteSubrange(i--, 1); + } +} + } // namespace caffe2 diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index f5933313c739ed..be0901ac64a038 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -13,6 +13,9 @@ namespace caffe2 { +void injectDataEdgeIndicators(caffe2::NetDef* net); +void removeDataEdgeIndicators(caffe2::NetDef* net); + CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); diff --git a/caffe2/opt/converter_nomigraph_test.cc b/caffe2/opt/converter_nomigraph_test.cc index 995c9a5961c800..e9da69a42dbe3c 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -98,3 +98,37 @@ TEST(Converter, ExternalOutputs) { EXPECT_EQ(new_netdef.external_output(i), net.external_output(i)); } } + +TEST(Converter, InjectDataEdgeIndicators) { + auto net = fakeNet(); + caffe2::injectDataEdgeIndicators(&net); + + EXPECT_EQ(net.op_size(), 3 + 1 + 2); // Inserted 1 Declare and 2 Export + + auto declare_count = 0; + auto export_count = 0; + for (const auto& op : net.op()) { + declare_count += op.type() == "Declare"; + export_count += op.type() == "Export"; + } + EXPECT_EQ(declare_count, 1); + EXPECT_EQ(export_count, 2); + + // Remove them from the network + EXPECT_EQ(net.external_input_size(), 0); + EXPECT_EQ(net.external_output_size(), 0); + + // Ensure nomnigraph can handle this change + auto nn = caffe2::convertToNNModule(net); + auto new_net = caffe2::convertToCaffe2Proto(nn); + + caffe2::removeDataEdgeIndicators(&new_net); + + for (const auto& op : new_net.op()) { + EXPECT_NE(op.type(), "Declare"); + EXPECT_NE(op.type(), "Export"); + } + + EXPECT_EQ(new_net.external_input_size(), 1); + EXPECT_EQ(new_net.external_output_size(), 2); +} From 08e5ca126241378e7a51ce25fd5eeb9cbd75ac91 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Fri, 28 Sep 2018 14:06:10 -0700 Subject: [PATCH 50/82] Add filter(NNModule) and explicit Declare/Export classes (#11955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11955 Adding a `filter(NNModule)` function to easily get inputs/outputs of a DAI-style NNModule. 
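Intended usage looks roughly like the sketch below (the Declare/Export node
kinds come from the previous diff in this stack; the exact namespaces and
include paths here are assumptions, not taken verbatim from this patch):

```cpp
#include "caffe2/opt/converter.h"
#include "nomnigraph/Representations/NeuralNet.h"

void collectDataEdges(caffe2::NetDef& net) {
  // Convert a NetDef that uses the Declare/Export convention, then collect
  // its inputs/outputs by node kind instead of walking the graph by hand.
  auto nn = caffe2::convertToNNModule(net);
  auto inputs = nom::repr::nn::filter<nom::repr::Declare>(nn);
  auto outputs = nom::repr::nn::filter<nom::repr::Export>(nn);
  (void)inputs;
  (void)outputs;
}
```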
Reviewed By: duc0 Differential Revision: D9997696 fbshipit-source-id: 818c4f2e3093e0d02b35e6632b426e8d3189c21e --- .../include/nomnigraph/Representations/NeuralNet.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index e7a889980365c5..523f29225aa07b 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -354,6 +354,11 @@ std::vector nodeIterator(G& g) { return out; } +template +inline std::vector filter(NNModule& nn) { + return nodeIterator(nn.dataFlow); +} + template std::vector> dataIterator(G& g) { std::vector> out; From 0a5dfa5a52c0107619035854caeab78cb4ba8a97 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Fri, 28 Sep 2018 14:06:12 -0700 Subject: [PATCH 51/82] Add support for device annotations on blobs Summary: device annotations on blobs with Declare and Export trick Reviewed By: yyetim Differential Revision: D9999916 fbshipit-source-id: 0bd4d15e7beed2788f47255d52ea296f8f674295 --- caffe2/opt/annotations.cc | 21 +++++++++++++++++++++ caffe2/opt/annotations.h | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/caffe2/opt/annotations.cc b/caffe2/opt/annotations.cc index 937fb789cce125..271ce3dcc4c61b 100644 --- a/caffe2/opt/annotations.cc +++ b/caffe2/opt/annotations.cc @@ -27,6 +27,27 @@ caffe2::OperatorDef* Caffe2Annotation::getMutableOperatorDef() { } // Distributed annotations +void Caffe2Annotation::setDeviceOption(const caffe2::DeviceOption& devOpt) { + *OpDef.mutable_device_option() = devOpt; +} + +bool Caffe2Annotation::hasDeviceOption() const { + return OpDef.has_device_option(); +} + +const caffe2::DeviceOption& Caffe2Annotation::getDeviceOption() const { + CAFFE_ENFORCE( + hasDeviceOption(), + "DeviceOption was never set. Use Caffe2Annotation::setDeviceOption."); + return OpDef.device_option(); +} +caffe2::DeviceOption* Caffe2Annotation::getMutableDeviceOption() { + CAFFE_ENFORCE( + hasDeviceOption(), + "DeviceOption was never set. 
Use Caffe2Annotation::setDeviceOption."); + return OpDef.mutable_device_option(); +} + void Caffe2Annotation::setDevice(std::string device) { Device = device; } diff --git a/caffe2/opt/annotations.h b/caffe2/opt/annotations.h index e143c5e960c542..9bc1f1e3137648 100644 --- a/caffe2/opt/annotations.h +++ b/caffe2/opt/annotations.h @@ -19,6 +19,11 @@ class CAFFE2_API Caffe2Annotation : public nom::repr::Annotation { const caffe2::OperatorDef& getOperatorDef() const; caffe2::OperatorDef* getMutableOperatorDef(); + void setDeviceOption(const caffe2::DeviceOption& opDef); + bool hasDeviceOption() const; + const caffe2::DeviceOption& getDeviceOption() const; + caffe2::DeviceOption* getMutableDeviceOption(); + // Distributed annotations void setDevice(std::string device); const std::string getDevice() const; From ebc26434981cf2e167b69d727354273470b8b71a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Fri, 28 Sep 2018 17:14:19 -0700 Subject: [PATCH 52/82] Enable multiple external output (#10957) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10957 att Differential Revision: D9525097 fbshipit-source-id: 0cea9469cea06cbfd3828549b168483413788269 --- .../mobile/contrib/ios/mpscnn/mpscnn_graph.mm | 60 ++++++++++++++++--- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm index b2945d58a1039e..843c9d89cccb68 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm @@ -46,6 +46,34 @@ Analysis analyzeNet(const NetDef& net) { return analysis; } +static void rewriteInput(OperatorDef* op, int i) { + auto input = op->input(i); + op->set_input(i, input + "_I"); +} + +static void rewriteOutput(OperatorDef* op, int i) { + auto output = op->output(i); + op->set_output(i, output + "_M"); +} + +static void insertInputCopyToMPSCNNOp( + NetDef& predictNet, + const std::string& cpu_blob) { + auto* op = predictNet.add_op(); + op->set_type("CopyToMPSCNN"); + op->add_input(cpu_blob); + op->add_output(cpu_blob + "_I"); +} + +static void insertOutputCopyFromMPSCNNOp( + NetDef& predictNet, + const std::string& cpu_blob) { + auto* op = predictNet.add_op(); + op->set_type("CopyFromMPSCNN"); + op->add_input(cpu_blob + "_M"); + op->add_output(cpu_blob); +} + NetDef insertInputOutputCopyOps(const NetDef& def) { // Do some validation of the outputs. 
For this version, we require: // - a single input (first element of external_input()) is consumed by the @@ -82,6 +110,8 @@ NetDef insertInputOutputCopyOps(const NetDef& def) { op.add_output("__METAL_INPUT_COPY__"); } + std::unordered_set output_set; + for (auto i = 0; i < def.op_size(); ++i) { const auto& ogOp = def.op(i); auto op = mdef.add_op(); @@ -90,17 +120,29 @@ NetDef insertInputOutputCopyOps(const NetDef& def) { CAFFE_ENFORCE_EQ(op->input(0), def.external_input(0)); op->set_input(0, "__METAL_INPUT_COPY__"); } - if (i == def.op_size() - 1) { - CAFFE_ENFORCE_EQ(op->output(0), def.external_output(0)); - op->set_output(0, "__METAL_OUTPUT_COPY__"); + // rewrite input + for (auto j = 0; j < op->input_size(); ++j) { + if (output_set.find(op->input(j)) != output_set.end()) { + insertInputCopyToMPSCNNOp(mdef, op->input(j)); + rewriteInput(op, j); + } + } + + // if the output is in external output, copy from metal when necessary + for (auto j = 0; j < op->output_size(); ++j) { + for (auto k = 0; k < def.external_output_size(); ++k) { + // Assuming external output blob has unique name, e.g. only version 0 + // of the blob is used as the output + if (op->output(j) == def.external_output(k)) { + output_set.insert(op->output(j)); + insertOutputCopyFromMPSCNNOp(mdef, op->output(j)); + // rewrite output to output_M for the operator + rewriteOutput(op, j); + } + } } } - { - auto& op = *(mdef.add_op()); - op.set_type("CopyFromMPSCNN"); - op.add_input("__METAL_OUTPUT_COPY__"); - op.add_output(def.external_output(0)); - } + return mdef; } From 22ce6060ecb5a3fa75c7af59159836c113ad0f57 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Fri, 28 Sep 2018 17:16:53 -0700 Subject: [PATCH 53/82] Add caffe2_api to exported functions (#12184) Summary: Broke the build, sorry. Pull Request resolved: https://github.com/pytorch/pytorch/pull/12184 Differential Revision: D10114818 Pulled By: bwasti fbshipit-source-id: 49844183a48d9383c5055a9ce06fe61fbf353050 --- caffe2/opt/converter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index be0901ac64a038..9666739d14f016 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -13,8 +13,8 @@ namespace caffe2 { -void injectDataEdgeIndicators(caffe2::NetDef* net); -void removeDataEdgeIndicators(caffe2::NetDef* net); +CAFFE2_API void injectDataEdgeIndicators(caffe2::NetDef* net); +CAFFE2_API void removeDataEdgeIndicators(caffe2::NetDef* net); CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); From 878e7740fdfd5538719eefdc778bbbc0284027f2 Mon Sep 17 00:00:00 2001 From: mruberry Date: Fri, 28 Sep 2018 19:30:55 -0700 Subject: [PATCH 54/82] Turns optimizations off when checking trace (#12172) Summary: Currently when tracing optimizations are performed twice. This means that optimizing passes, like the fusion pass, are also called twice. This is unnecessary and this PR turns off optimizations when checking the trace (since the trace is independent of optimizations). This should improve performance and debugging. apaszke who proposed this change. 
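Concretely, the affected path is the trace checker (a sketch; `check_inputs`
is the tracing option that routes through `_check_trace` in this file):

```python
import torch

def f(x):
    return x * 2 + x

x = torch.randn(3, 4)

# check_inputs makes jit.trace re-run the tracer on the extra inputs via
# _check_trace(); with this patch that checking run forces optimize=False,
# so the fuser and other passes no longer run a second time just to
# compare traces.
traced = torch.jit.trace(f, x, check_inputs=[(torch.randn(3, 4),)])
```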
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12172 Reviewed By: ezyang Differential Revision: D10109250 Pulled By: apaszke fbshipit-source-id: 8b3385eae143446820f1b61ca7576d7c07f9b248 --- torch/jit/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 21553e4aca9915..5fd90b5fd95382 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -365,6 +365,8 @@ def __init__(self, graph_diff_error, tensor_compare_error, extra_msg=None): # Check the traced module against a set of user-provided validation inputs @torch.no_grad() def _check_trace(check_inputs, func, executor_options, module, check_tolerance): + # Note: tracing is independent of optimizations, which consume the trace + executor_options['optimize'] = False for inputs in check_inputs: if isinstance(inputs, torch.Tensor): inputs = (inputs,) From a2ebbccc9f04abc89c37e3227355858d668ac000 Mon Sep 17 00:00:00 2001 From: iotamudelta Date: Fri, 28 Sep 2018 23:06:06 -0700 Subject: [PATCH 55/82] fix unit tests on CI Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12187 Differential Revision: D10118483 Pulled By: bddppq fbshipit-source-id: 986c8fb48d61e00103c713548a50e74489a0e442 --- aten/src/ATen/cuda/CUDAContext.cpp | 14 ++++++-------- aten/src/ATen/cuda/CUDAContext.h | 2 -- test/test_autograd.py | 1 + test/test_cuda.py | 10 +++++----- test/test_dataloader.py | 1 + test/test_jit.py | 3 +++ test/test_nn.py | 1 + test/test_sparse.py | 3 +++ test/test_torch.py | 5 +++++ 9 files changed, 25 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp index 58248acfe17951..0a4649d9c41ad4 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -54,15 +54,13 @@ Allocator* getCUDADeviceAllocator() { } /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle() { - return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); - } +cusparseHandle_t getCurrentCUDASparseHandle() { + return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); +} - cublasHandle_t getCurrentCUDABlasHandle() { - return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); - } -#endif +cublasHandle_t getCurrentCUDABlasHandle() { + return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); +} } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 83a890da4d535e..3a480d2ca4e4e3 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -59,10 +59,8 @@ CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle(); -#endif } // namespace cuda diff --git a/test/test_autograd.py b/test/test_autograd.py index f9ccfb6c958e99..0642e87399c676 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1406,6 +1406,7 @@ def test_unused_output(self): expected_grad[:2] = grad_output self.assertEqual(x.grad.data, expected_grad) + @skipIfRocm def test_ctc_loss(self): batch_size = 64 num_labels = 101 diff --git a/test/test_cuda.py b/test/test_cuda.py index cdf8d46ce236cf..2c647b08cbd601 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -268,11 +268,11 @@ def tmp(t): ('div', small_3d, lambda 
t: [number(3.14, 3, t)], '', types, False, "skipIfRocm:ByteTensor,CharTensor,FloatTensor,HalfTensor,ShortTensor"), ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), + ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'), + ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'), + ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'), + ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), # HalfTensor gives bad result at pow-2 with data sampled from torch.randn ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, False, "skipIfRocm:HalfTensor,FloatTensor"), diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 020486c1fbda35..3d9af20c859658 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -371,6 +371,7 @@ def test_segfault(self): finally: p.terminate() + @skipIfRocm def test_timeout(self): p = ErrorTrackingProcess(target=_test_timeout) p.start() diff --git a/test/test_jit.py b/test/test_jit.py index 0757f5fb1f7640..89aec6001f4582 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7248,6 +7248,7 @@ def test_dcgan_models(self): self._test_dcgan_models(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) @@ -7370,11 +7371,13 @@ def test_mnist(self): self._test_mnist(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_mnist(self, device='cuda', check_export_import=False) @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() # MnistNet uses dropout, don't check its trace diff --git a/test/test_nn.py b/test/test_nn.py index 0d61d72f3ceb66..eee4e3a7c74755 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4202,6 +4202,7 @@ def get_inputs(input_shape, hidden_shape, mode): test(input_shape, hidden_shape, mode) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_rnn_check_device(self): input_size = 3 hidden_size = 5 diff --git a/test/test_sparse.py b/test/test_sparse.py index 0e91dca37d4c3f..f95d7256c4042b 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -1033,6 +1033,7 @@ def _all_narrow_combs(self, shape): for length in range(dim_sz - start): yield [dim, start, length] + @skipIfRocm def test_narrow(self): shape = [3, 3, 4, 2] input, _, _ = self._gen_sparse(4, 19, shape) @@ -1437,6 +1438,7 @@ def test_tensor(indices, values, indices_equal, values_equal): test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 @cpu_only # just run once, we test both cpu and cuda + @skipIfRocm def test_constructor_device_legacy(self): i = torch.tensor([[0, 1, 1], [2, 0, 2]]) v = 
torch.tensor([3., 4., 5.]) @@ -1583,6 +1585,7 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 0], [2, 2, 0]) + @skipIfRocm def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) diff --git a/test/test_torch.py b/test/test_torch.py index 84ef8a22e050b3..3026548b99043e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3999,6 +3999,7 @@ def test_is_signed_cuda(self): self.assertEqual(torch.cuda.HalfTensor(10).is_signed(), True) @skipIfNoLapack + @skipIfRocm def test_gesv(self): a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), (-6.05, -3.30, 5.36, -4.44, 1.08), @@ -4130,6 +4131,7 @@ def test_gesv_batched_dims(self): self._test_gesv_batched_dims(self, lambda t: t) @skipIfNoLapack + @skipIfRocm def test_qr(self): # Since the QR decomposition is unique only up to the signs of the rows of @@ -4312,10 +4314,12 @@ def _test_trtrs(self, cast): self.assertEqual(res1, tb, 0) @skipIfNoLapack + @skipIfRocm def test_trtrs(self): self._test_trtrs(self, lambda t: t) @skipIfNoLapack + @skipIfRocm def test_gels(self): def _test_underdetermined(a, b, expectedNorm): m = a.size()[0] @@ -4431,6 +4435,7 @@ def check_norm(a, b, expected_norm, gels_result): self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8) @skipIfNoLapack + @skipIfRocm def test_eig(self): a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00), (-6.49, 3.80, 0.00, 0.00, 0.00), From 40aa212cd65c2852d35ce3e43c731d63599aefbb Mon Sep 17 00:00:00 2001 From: Jiyan Yang Date: Sat, 29 Sep 2018 21:43:14 -0700 Subject: [PATCH 56/82] Support fp16 mkl engine in training Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12080 Reviewed By: hyuen Differential Revision: D10037719 fbshipit-source-id: 618ce894eccc4c87a038dc3ab836684f16843cde --- caffe2/python/optimizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 5454b8cd4d3fba..ddd5871f7d4b74 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -22,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"] + logger = logging.getLogger(__name__) @@ -584,7 +586,7 @@ def _run(self, net, param_init_net, param_info): value=0.0 ) else: - if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + if self.engine in FP16_ENGINES: shapes, types = workspace.InferShapesAndTypes([param_init_net]) assert str(param) in shapes, shapes shape = shapes[str(param)] From 5ffc915f26a7759d3a24b692599a49db2ef6b0c0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 29 Sep 2018 22:23:36 -0700 Subject: [PATCH 57/82] fix docs (#12126) Summary: - fix https://github.com/pytorch/pytorch/issues/12120 - add `torch.argsort`, `torch.pdist`, `broadcast_tensors` to *.rst files - add parameter dim to `torch.unique` doc - fix table and args for `torch.norm` - test plan: make html and check docs in browser gchanan Pull Request resolved: https://github.com/pytorch/pytorch/pull/12126 Differential Revision: D10087006 Pulled By: weiyangfb fbshipit-source-id: 25f65c43d14e02140d0da988d8742c7ade3d8cc9 --- docs/source/nn.rst | 5 +++++ docs/source/torch.rst | 2 ++ torch/_torch_docs.py | 6 +++--- torch/functional.py | 26 +++++++++++++++----------- 
torch/nn/modules/conv.py | 10 +++++----- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 68420d837bf801..a0d3abfa5501ad 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -1173,6 +1173,11 @@ Distance functions .. autofunction:: cosine_similarity +:hidden:`pdist` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pdist + Loss functions -------------- diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 31585d4a969770..1d55fa8b937738 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -223,6 +223,7 @@ Reduction Ops Comparison Ops ~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: allclose +.. autofunction:: argsort .. autofunction:: eq .. autofunction:: equal .. autofunction:: ge @@ -256,6 +257,7 @@ Spectral Ops Other Operations ~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: bincount +.. autofunction:: broadcast_tensors .. autofunction:: cross .. autofunction:: diag .. autofunction:: diagflat diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 7601ce3c268d4c..44b963f38d3bb8 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1210,7 +1210,7 @@ def parse_kwargs(desc): Computes the logarithmic derivative of the gamma function on `input`. .. math:: - \psi(x) = \frac{d}{dx} \ln\left(\Gamma\left(x\right)\right) = \frac{\Gamma'(x)}{\Gamma(x)} + \psi(x) = \frac{d}{dx} \ln\left(\gamma\left(x\right)\right) = \frac{\gamma'(x)}{\gamma(x)} Args: input (Tensor): the tensor to compute the digamma function on @@ -2936,9 +2936,9 @@ def parse_kwargs(desc): Computes the multivariate log-gamma function with dimension :math:`p` element-wise, given by .. math:: - \log(\Gamma_{p}(a)) = C + \displaystyle \sum_{i=1}^{p} \log\left(\Gamma\left(a - \frac{i - 1}{2}\right)\right) + \log(\gamma_{p}(a)) = C + \displaystyle \sum_{i=1}^{p} \log\left(\gamma\left(a - \frac{i - 1}{2}\right)\right) -where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\Gamma(.)` is the Gamma function. +where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\gamma(.)` is the Gamma function. If any of the elements are less than or equal to :math:`\frac{p - 1}{2}`, then an error is thrown. diff --git a/torch/functional.py b/torch/functional.py index 0eac8f16741766..47625171c519e2 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -439,6 +439,8 @@ def unique(input, sorted=False, return_inverse=False, dim=None): before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. + dim (int): the dimension to apply unique. If ``None``, the unique of the + flattened input is returned. 
default: ``None`` Returns: (Tensor, Tensor (optional)): A tensor or a tuple of tensors containing @@ -646,8 +648,9 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): Args: input (Tensor): the input tensor - p ({int, float, inf, -inf, 'fro', 'nuc'}): the order of norm + p (int, float, inf, -inf, 'fro', 'nuc'): the order of norm The following norms can be calculated: + ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== @@ -656,18 +659,19 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== - dim ({int, 2-tuple of ints, 2-list of ints}, optional): If it is an int, - vector norm will be calculated, if it is 2-tuple of ints, matrix norm - will be calculated. If the value is None, matrix norm will be calculated - when the input tensor only has two dimensions, vector norm will be - calculated when the input tensor only has one dimension. If the input - tensor has more than two dimensions, the vector norm will be applied to - last dimension. + + dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, + vector norm will be calculated, if it is 2-tuple of ints, matrix norm + will be calculated. If the value is None, matrix norm will be calculated + when the input tensor only has two dimensions, vector norm will be + calculated when the input tensor only has one dimension. If the input + tensor has more than two dimensions, the vector norm will be applied to + last dimension. keepdim (bool): whether the output tensors have :attr:`dim` - retained or not. Ignored if attr:`dim`=``None`` and - :attr:`out`=``None``. + retained or not. Ignored if :attr:`dim` = ``None`` and + :attr:`out` = ``None``. out (Tensor, optional): the output tensor. Ignored if - attr:`dim`=``None`` and :attr:`out`=``None``. + :attr:`dim` = ``None`` and :attr:`out` = ``None``. Example:: >>> import torch diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index a1bfcbc08e097e..313bad93fea4bd 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -664,10 +664,10 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - .. math:: + .. math:: H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + \text{output\_padding}[0] - + .. math:: W_{out} = (W_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{kernel\_size}[1] + \text{output\_padding}[1] @@ -806,13 +806,13 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` where - .. math:: + .. math:: D_{out} = (D_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + \text{output\_padding}[0] - + .. math:: H_{out} = (H_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{kernel\_size}[1] + \text{output\_padding}[1] - + .. 
math:: W_{out} = (W_{in} - 1) \times \text{stride}[2] - 2 \times \text{padding}[2] + \text{kernel\_size}[2] + \text{output\_padding}[2] From 93ecf4d72ada7fd6cbcf9ec582ecb0e37344bc7c Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Sat, 29 Sep 2018 22:40:09 -0700 Subject: [PATCH 58/82] Remove raise_from (#12185) Summary: soumith CC alsrgv Fixes #11995 Pull Request resolved: https://github.com/pytorch/pytorch/pull/12185 Differential Revision: D10120103 Pulled By: goldsborough fbshipit-source-id: ef7807ad83f9efc05d169675b7ec72986a5d17c3 --- torch/utils/cpp_extension.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 4b1c4cbc32bc09..eff79df6f29b84 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -10,8 +10,6 @@ import tempfile import warnings -from future.utils import raise_from - import torch from .file_baton import FileBaton from ._cpp_extension_versioner import ExtensionVersioner @@ -858,7 +856,7 @@ def _build_extension_module(name, build_directory, verbose): message = "Error building extension '{}'".format(name) if hasattr(error, 'output') and error.output: message += ": {}".format(error.output.decode()) - raise_from(RuntimeError(message), None) + raise RuntimeError(message) def _import_module_from_library(module_name, path): From 572132fb1701e3c9a16dea50547355773a66a9a2 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 30 Sep 2018 11:45:17 -0700 Subject: [PATCH 59/82] copy_(Sparse, Sparse) for sparse tensor (#9005) Summary: - fix #8330 - add `torch.copy_(Sparse, Sparse)` with autograd support Pull Request resolved: https://github.com/pytorch/pytorch/pull/9005 Differential Revision: D8987885 Pulled By: weiyangfb fbshipit-source-id: b317a41da22ee1eae2835622a0ed28a6771a3a06 --- aten/src/ATen/native/native_functions.yaml | 4 +- aten/src/ATen/native/sparse/SparseTensor.cpp | 6 +- .../ATen/native/sparse/SparseTensorMath.cpp | 4 +- aten/src/ATen/native/sparse/SparseUtils.h | 4 +- .../sparse/cuda/SparseCUDATensorMath.cu | 2 +- aten/src/ATen/templates/TypeDefault.cpp | 21 ++---- test/test_sparse.py | 68 +++++++++++++++++++ tools/autograd/gen_python_functions.py | 3 +- tools/autograd/templates/VariableType.cpp | 4 +- 9 files changed, 90 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 9a7c56c3a499f1..b4ebdfb634e422 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2068,8 +2068,8 @@ SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda -# This "raw copy" doesn't handle conversions NOR does it handle non-blocking. 
-- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor +- func: copy_sparse_to_sparse_(Tensor self, Tensor src, bool non_blocking=false) -> Tensor + variants: function dispatch: SparseCPU: copy_sparse_ SparseCUDA: copy_sparse_ diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 83aee52cf81021..7e2340be24a10f 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -204,7 +204,7 @@ SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Te SparseTensor clone_sparse(const SparseTensor& self) { SparseTensor other = new_with_dims_and_size_sparse(self.type(), self._sparseDims(), self._denseDims(), self.sizes()); - _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values()); + _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values(), true); _get_sparse_impl(other)->set_coalesced(self.is_coalesced()); return other; } @@ -243,11 +243,11 @@ Tensor sparse_to_dense(const SparseTensor& self) { return dst.add_(self); } -SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src) { +SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src, bool non_blocking) { if (isSameTensor(self, src)) return self; _get_sparse_impl(self)->resize_(src._sparseDims(), src._denseDims(), src.sizes()); // NB: This seems to copy the underlying full indices/values buffer - _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values()); + _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values(), non_blocking); _get_sparse_impl(self)->set_coalesced(src.is_coalesced()); return self; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 8a8668fc48b8a1..c71e38450974a6 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -98,7 +98,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); } else { - r = raw_copy_sparse_(r, t.coalesce()); + copy_sparse_to_sparse_(r, t.coalesce()); } r._values().log1p_(); return r; @@ -192,7 +192,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { - return raw_copy_sparse_(r, t); + return copy_sparse_to_sparse_(r, t); } if (t._nnz() == 0) { return mul_out_sparse_scalar(r, src, value); diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 2626eedebaf5e2..a0fbf4ea904cc4 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -50,8 +50,8 @@ inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indic // Take indices and values and makes a (data) copy of them to put into the sparse // indices/values. 
This used to be called THSTensor_(_set) -inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { - _alias_into_sparse(self, indices.clone(), values.clone()); +inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) { + _alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking)); } // Does NOT make copies of indices/values diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 036666bec82ac2..2abc10e62c3d46 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -363,7 +363,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { - return raw_copy_sparse_(r_, t); + return copy_sparse_to_sparse_(r_, t); } if (t._nnz() == 0) { return mul_out_sparse_scalar(r_, src, value); diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 0891f6d9f4f492..03309f8fe9eee3 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -18,7 +18,8 @@ namespace at { Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; - std::tie(b_src) = expand_inplace(self, src, "copy"); + if (is_sparse()) b_src = src; + else std::tie(b_src) = expand_inplace(self, src, "copy"); return s_copy_(self, b_src, non_blocking); } @@ -28,19 +29,11 @@ Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional device_guard.set_index(to_device.value().index()); } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); - if (is_sparse()) { - auto indices = src._indices(); - auto values = src._values(); - auto & this_dense = toBackend(is_cuda() ? 
Backend::CUDA : Backend::CPU); - auto & this_dense_idx = this_dense.toScalarType(ScalarType::Long); - auto indices_copy = this_dense_idx.copy(indices, non_blocking); - auto values_copy = this_dense.copy(values, non_blocking); - return _sparse_coo_tensor_unsafe(indices_copy, values_copy, src.sizes()); - } else { - Tensor r = this->tensor(src.sizes()); - r.copy_(src, non_blocking); - return r; - } + Tensor r; + if (is_sparse()) r = this->native_tensor(); + else r = this->tensor(src.sizes()); + r.copy_(src, non_blocking); + return r; } void TypeDefault::backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const { diff --git a/test/test_sparse.py b/test/test_sparse.py index f95d7256c4042b..831f0f746ae312 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -496,6 +496,74 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + def test_Sparse_to_Sparse_copy_(self): + # This is for testing torch.copy_(SparseTensor, SparseTensor) + sparse_dims = 3 + nnz = 10 + sizes = [2, 3, 4, 5] # hybrid sparse + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + + # test copy + x2_dense = x2.to_dense() + x1.copy_(x2) + self.assertEqual(x2_dense, x1.to_dense()) + + # test type conversion (when x1.copy_(x2), x1.dtype should stay the same) + x1 = x1.to(torch.float32) + x2 = x2.to(torch.float64) + x1_dtype = x1.dtype + x1.copy_(x2) + self.assertEqual(x1_dtype, x1.dtype) + + # test no broadcast + self.assertRaises(RuntimeError, lambda: x1.copy_(x2.narrow_copy(0, 0, 1))) + + # test raise error on copy_() between dense and sparse Tensors + self.assertRaises(RuntimeError, lambda: x1.copy_(torch.randn(5, 5))) + + # test autograd + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + x2.requires_grad_(True) + x1.copy_(x2) + y = x1 * 2 + x2_clone = x2.clone() + y.backward(x2_clone) + expected_grad = x2_clone * 2 + self.assertEqual(expected_grad.to_dense(), x2.grad.to_dense()) + self.assertEqual(None, x1.grad) + + @unittest.skipIf(torch.cuda.device_count() < 2, "no multi-GPU") + def test_Sparse_to_Sparse_copy_multi_gpu(self): + # This is for testing torch.copy_(SparseTensor, SparseTensor) across GPU devices + sparse_dims = 3 + nnz = 10 + sizes = [2, 3, 4, 5] # hybrid sparse + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + x1 = x1.to('cuda:0') + + def test_cross_device(x1, x2): + x1_device = x1.device + x1.copy_(x2) + self.assertEqual(x2.to('cuda:0').to_dense(), x1.to_dense()) + self.assertEqual(x1_device, x1.device) + + test_cross_device(x1, x2.to('cuda:1')) # test across gpu devices + test_cross_device(x1, x2.to('cpu')) # test between cpu and gpu + + # test autograd + x2 = x2.to('cuda:1') + x2.requires_grad_(True) + x1.copy_(x2) + y = x1 * 2 + x2_clone = x2.clone().to('cuda:0') + y.backward(x2_clone) + expected_grad = x2_clone * 2 + self.assertEqual(expected_grad.to_dense(), x2.grad.to('cuda:0').to_dense()) + self.assertEqual(None, x1.grad) + @cuda_only def test_cuda_empty(self): def test_tensor(x): diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 81856c62ad07d9..7e26d84432182c 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -28,7 +28,8 @@ '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', 'arange.*', 
'range.*', '_gesv.*', '_getri.*', 'slice', 'randint(_out)?', '_local_scalar', '_local_scalar_dense', - 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to' + 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to', + 'copy_sparse_to_sparse_' ] # These function signatures are not exposed to Python. Note that this signature diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 24ac92dd63926f..64ad9fc5e6d185 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -416,7 +416,9 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block grad_fn->src_device = src.get_device(); } } - baseType->s_copy_(self_, src_, non_blocking); + if (self.is_sparse() && src.is_sparse()) baseType->copy_sparse_to_sparse_(self_, src_, non_blocking); + else if (!self.is_sparse() && !src.is_sparse()) baseType->s_copy_(self_, src_, non_blocking); + else AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", self.type(), " and src type = ", src.type()); increment_version(self); rebase_history(as_variable_ref( self ), std::move(grad_fn)); if(torch::jit::tracer::isTracing()) { From c3817e85fabd59cb40a2c5b0a5c0ab5b5da3a01c Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Sun, 30 Sep 2018 15:35:16 -0700 Subject: [PATCH 60/82] Temporary fix for LibTorch download link (#12212) Summary: We're waiting for the libtorch links to show up on the website. I had a fake link in the docs so far which is misleading. This PR changes it to a temporary markdown file until the web people fix the site tomorrow. Pull Request resolved: https://github.com/pytorch/pytorch/pull/12212 Differential Revision: D10121872 Pulled By: goldsborough fbshipit-source-id: f1bd1315f7333b9168e99983f3f6b679c9b0c52a --- docs/cpp/source/installing.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/cpp/source/installing.rst b/docs/cpp/source/installing.rst index d304f8168838f0..24906dbb53391a 100644 --- a/docs/cpp/source/installing.rst +++ b/docs/cpp/source/installing.rst @@ -4,7 +4,7 @@ Installing C++ Distributions of PyTorch We provide binary distributions of all headers, libraries and CMake configuration files required to depend on PyTorch. We call this distribution *LibTorch*, and you can download ZIP archives containing the latest LibTorch -distribution on `our website `_. Below +distribution on `our website `_. Below is a small example of writing a minimal application that depends on LibTorch and uses the `at::Tensor` class which comes with the PyTorch C++ API. @@ -16,9 +16,8 @@ example: .. 
code-block:: sh - wget http://pytorch.org/libtorch/libtorch-latest.zip - unzip libtorch-latest.zip - ls -1R libtorch-latest + wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip + unzip libtorch-shared-with-deps-latest.zip Next, we can write a minimal CMake build configuration to develop a small From 9768b4d4ffee4ee96c5ef68d3c8203a105fa8747 Mon Sep 17 00:00:00 2001 From: Hector Yuen Date: Sun, 30 Sep 2018 23:50:15 -0700 Subject: [PATCH 61/82] support half float for SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient (#12186) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12186 specialized implementation, preconvert embeddings to float and do everything on fp32 Reviewed By: jspark1105 Differential Revision: D10100603 fbshipit-source-id: 3255b4addb6fda24722bd519163099f5d354d084 --- caffe2/operators/segment_reduction_op.cc | 1 + caffe2/operators/segment_reduction_op.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc index 4029a52d0c3587..482d0599fc0e2b 100644 --- a/caffe2/operators/segment_reduction_op.cc +++ b/caffe2/operators/segment_reduction_op.cc @@ -51,6 +51,7 @@ OPERATOR_SCHEMA(SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient) REGISTER_CPU_OPERATOR( SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient, AbstractLengthsWithMainInputGradientOp< + float, float, int, CPUContext, diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index 9e7ab6d604016c..4449613787e881 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -1616,6 +1616,7 @@ class AbstractLengthsGradientOp : public Operator { // Version of gradient that requires the main input and thus needs to receive // length, indices and other stuff template < + typename Tembedding, typename T, typename TLengths, class Context, @@ -1689,8 +1690,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); T* dataGrads = dataGradsOutput->template mutable_data(); - const T* data = dataInput.template data(); - + const Tembedding* data = dataInput.template data(); int64_t dataIndex = 0; for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { ReducerGradient reducer( @@ -1945,6 +1945,7 @@ segments, i.e. len(*LENGTHS*). using BackwardOp = AbstractLengthsGradientOp; using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< + T, T, SIndex, Context, @@ -2048,6 +2049,7 @@ i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor. ReducerGradient, false /*GradientNeedIndices*/>; using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< + T, T, SIndex, Context, From f3c32a4b5499a188d8a6242b6c662bbba1d7521c Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Mon, 1 Oct 2018 09:27:38 -0700 Subject: [PATCH 62/82] dnnlowp_16 -> dnnlowp_acc16 (#12205) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12205 We're more interested in testing the performance of DNNLOWP_ACC16 engine. 
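For example, the benchmark can now be pointed at the 16-bit-accumulation
engine like this (invocation is illustrative; the flag names are assumed
from the helper's arguments rather than taken from the binary's --help):

```sh
./caffe2_benchmark --init_net init_net.pb --net predict_net.pb \
    --backend dnnlowp_acc16   # was: --backend dnnlowp_16
```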
Reviewed By: llyfacebook Differential Revision: D10121080 fbshipit-source-id: 7def38be838feb7636f7dd0c8ed352c2df398ec1 --- binaries/benchmark_helper.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index f481a6292c7f56..ecbae477282c15 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -81,8 +81,8 @@ void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { : backend == "cuda" ? "CUDA" : backend == "dnnlowp" ? "DNNLOWP" - : backend == "dnnlowp_16" - ? "DNNLOWP_16" + : backend == "dnnlowp_acc16" + ? "DNNLOWP_ACC16" : backend == "default" ? "" : "NONE"; CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); for (int i = 0; i < net_def->op_size(); i++) { From fed91f873fc73a9ec4d212a0d1abd3fc966eacc0 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Mon, 1 Oct 2018 10:08:08 -0700 Subject: [PATCH 63/82] (Very small) allow trailing commas in assign or tuples (#11723) Summary: Allow trailing commas in assign statements or tuples, which also allows single element tuples. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11723 Differential Revision: D10052162 Pulled By: eellison fbshipit-source-id: 344d908a3ad942a23ebd9f341794bc9734226aa8 --- test/test_jit.py | 7 ++++--- torch/csrc/jit/script/parser.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 89aec6001f4582..24d8076d31365f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -4925,7 +4925,6 @@ def bar(): bar() def test_tuples(self): - @torch.jit.script def foo(i): a = (i + 4, i * 2) c = a @@ -4937,10 +4936,12 @@ def foo(i): while False: t0, t1 = c c = (t1, t0) - return t0 + x = (1,) + y = 1, + return t0, x, y v = torch.rand(10, 3) - self.assertEqual(v * 9, foo(v)) + self.checkScript(foo, (v,)) with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type \(Tensor, Tensor\)"): @torch.jit.script diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 14e5e4f5ae1354..64f7f9c8db935a 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -64,6 +64,8 @@ struct Parser { std::vector exprs = { prefix }; while(L.cur().kind != end) { L.expect(','); + if (L.cur().kind == end) + break; exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); From 006171fffc4f3fe7d8538f9f7a5b015d5bfc0332 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 1 Oct 2018 11:02:11 -0700 Subject: [PATCH 64/82] Back out "[pytorch][PR] Revert "Move CreateContext to global registry (#11688)"" (#12121) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12121 Pull Request resolved: https://github.com/pytorch/pytorch/pull/12055 Original commit changeset: 6ca9de65b707 Reviewed By: ezyang Differential Revision: D10033396 fbshipit-source-id: ca9f4b2f7ef0561f619b833415d394a8b9972bf4 --- aten/src/ATen/core/TensorImpl.h | 31 ++++++++++++++-------------- aten/src/ATen/core/context_base.cpp | 11 ++++++++++ aten/src/ATen/core/context_base.h | 26 ++++++++++++++++------- caffe2/core/blob_serialization.cc | 5 ++--- caffe2/core/context.cc | 4 ++++ caffe2/core/context.h | 11 ++-------- caffe2/core/context_base.cc | 1 + caffe2/core/context_gpu.cu | 5 +++++ caffe2/core/context_gpu.h | 15 ++------------ caffe2/core/hip/context_hip.cc | 18 +++++++++------- caffe2/core/hip/context_hip.h | 15 ++------------ caffe2/core/tensor.h | 8 +++---- 
caffe2/core/tensor_impl.cc | 1 - caffe2/ideep/utils/ideep_context.h | 11 ++-------- caffe2/ideep/utils/ideep_register.cc | 3 +++ caffe2/mkl/utils/mkl_context.cc | 4 ++++ caffe2/mkl/utils/mkl_context.h | 11 ++-------- caffe2/proto/caffe2_pb.h | 19 ++++++++++++++++- caffe2/python/pybind_state.h | 2 +- 19 files changed, 108 insertions(+), 93 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index f685aa9d93e377..f899c7ec1d1446 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -365,6 +365,10 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return storage_.device_type(); } + at::Device GetDevice() const { + return storage_.device(); + } + /** * The static context of a tensor intuitively represents the device * type of a tensor; e.g., a CPU tensor is associated with the @@ -376,18 +380,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return ::caffe2::get_static_context(device_type()); } - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us, and then eliminate - * this method. - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - /** * @brief Copies the data from a source tensor, with a contex provided to * carry out the underlying memcpy operation. This method respects @@ -429,8 +421,12 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // knows how to copy between CPU and that context if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) { if (!context) { - src.CreateContext()->CopyBytesToDevice( - numel() * itemsize(), src.data(), raw_mutable_data(data_type_), device_type()); + CreateContext(src.GetDevice()) + ->CopyBytesToDevice( + numel() * itemsize(), + src.data(), + raw_mutable_data(data_type_), + device_type()); } else { CAFFE_ENFORCE( context->device_type() == src.device_type(), @@ -442,8 +438,11 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // In case source context is CPU, and target context is non-CPU // We'll have to create a Context from target and perform the // copy using that context - CreateContext()->CopyBytesFromCPU( - numel() * itemsize(), src.data(), raw_mutable_data(data_type_)); + CreateContext(GetDevice()) + ->CopyBytesFromCPU( + numel() * itemsize(), + src.data(), + raw_mutable_data(data_type_)); } } } diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp index e34c6880c0210a..f81bd81361305f 100644 --- a/aten/src/ATen/core/context_base.cpp +++ b/aten/src/ATen/core/context_base.cpp @@ -1,5 +1,16 @@ #include +namespace at { + +C10_DEFINE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +} // namespace at + namespace caffe2 { // TODO: rename context.h -> context_cpu.h & context_base.h -> context.h diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 326cae5eb9691e..13bc885da344ee 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -6,11 +6,12 @@ #include #include -#include +#include +#include #include #include #include -#include +#include namespace caffe2 { class Event; @@ -31,11 +32,6 @@ class CAFFE2_API BaseStaticContext { virtual std::pair New(size_t nbytes) const = 0; - virtual std::unique_ptr 
CreateContext() = 0; - - virtual std::unique_ptr CreateContext( - const caffe2::DeviceOption&) = 0; - virtual DeviceType GetDeviceType() = 0; /* @@ -184,6 +180,22 @@ class CAFFE2_API BaseContext { } }; +// Context constructor registry +C10_DECLARE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +#define REGISTER_CONTEXT(type, ...) \ + C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__) + +inline std::unique_ptr CreateContext( + const at::Device& device) { + return at::ContextRegistry()->Create(device.type(), device); +} + } // namespace at namespace caffe2 { diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index a60fa12a49b6af..8126b3d59425a1 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -196,7 +196,7 @@ void TensorSerializer::Serialize( const TensorProto::DataType data_type = TypeMetaToDataType(input.meta()); proto.set_data_type(data_type); StoreDeviceDetail(input, &proto); - auto uniq_ptr = input.GetStaticContext()->CreateContext(); + auto uniq_ptr = CreateContext(input.GetDevice()); // A lot of copypaste is error prone. Should we create a macro for this? switch (data_type) { case TensorProto_DataType_FLOAT: @@ -371,8 +371,7 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { // We create a local context for deserializing. Since Caffe2 contexts are // usually lightweight, this should not involve too much overhead. - auto uniq_ptr = - tensor->GetStaticContext()->CreateContext(proto.device_detail()); + auto uniq_ptr = CreateContext(OptionToDevice(proto.device_detail())); auto context = uniq_ptr.get(); context->SwitchToDevice(0); vector dims; diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc index 30819afdc4ce3f..94047eb71ee0b6 100644 --- a/caffe2/core/context.cc +++ b/caffe2/core/context.cc @@ -5,6 +5,10 @@ #include #endif +namespace at { + +REGISTER_CONTEXT(DeviceType::CPU, caffe2::CPUContext); +} // namespace at namespace caffe2 { uint32_t RandomNumberSeed() { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index aff66534d22198..af66396af72c44 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -50,6 +50,8 @@ class CAFFE2_API CPUContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU); } + explicit CPUContext(const at::Device& device) + : CPUContext(DeviceToOption(device)) {} ~CPUContext() noexcept override {} @@ -192,15 +194,6 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { return data_and_deleter; } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return CPU; } diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc index b61b73cbad1cb5..99996d9e165b9b 100644 --- a/caffe2/core/context_base.cc +++ b/caffe2/core/context_base.cc @@ -1,4 +1,5 @@ #include "context_base.h" namespace caffe2 { + } // namespace caffe2 diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 5ffc8c699154f2..f10fe067ac746c 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -57,6 +57,11 @@ CAFFE2_DEFINE_int( 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + 
+REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext); +} // namespace at + namespace caffe2 { ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index afb2e93fdd7fa8..65ba4a006a94af 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -142,6 +142,8 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { // The default cuda context constructor. explicit CUDAContext(const int gpu_id = -1); explicit CUDAContext(const DeviceOption& option); + explicit CUDAContext(const at::Device& device) + : CUDAContext(DeviceToOption(device)) {} ~CUDAContext() override { if (curand_generator_) { @@ -385,19 +387,6 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return CUDA; } diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc index 0fabb20a642c94..3eadaf0e71b118 100644 --- a/caffe2/core/hip/context_hip.cc +++ b/caffe2/core/hip/context_hip.cc @@ -50,6 +50,11 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb, 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::HIP, caffe2::HIPContext); +} // namespace at + namespace caffe2 { thread_local ThreadLocalHIPObjects HIPContext::hip_objects_; @@ -408,13 +413,12 @@ void HIPStaticContext::Delete(void* ptr) { g_hip_device_affiliation.erase(it); break; } - case HipMemoryPoolType::THC: - { - HIP_ENFORCE(g_thc_allocator->Free(ptr)); - if (FLAGS_caffe2_gpu_memory_tracking) { - g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); - } - break; + case HipMemoryPoolType::THC: { + HIP_ENFORCE(g_thc_allocator->Free(ptr)); + if (FLAGS_caffe2_gpu_memory_tracking) { + g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); + } + break; } } } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index 5a7613cf934fd0..fb04336354e704 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -127,6 +127,8 @@ class HIPContext final : public BaseContext { // The default HIP context constructor. 
explicit HIPContext(const int gpu_id = -1); explicit HIPContext(const DeviceOption& option); + explicit HIPContext(const at::Device& device) + : HIPContext(DeviceToOption(device)) {} ~HIPContext() override { if (hiprand_generator_) { @@ -374,19 +376,6 @@ class HIPStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return HIP; } diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 0563221feb2e83..bb478e415a8ce6 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -115,14 +115,14 @@ class CAFFE2_API Tensor final { return impl_.get()->GetStaticContext(); } - std::unique_ptr CreateContext() const { - return impl_.get()->CreateContext(); - } - DeviceType GetDeviceType() const { return impl_->device_type(); } + at::Device GetDevice() const { + return impl_.get()->GetDevice(); + } + void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const { impl_.get()->CopyFrom(*src.impl_.get(), context); } diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc index cff98c6101ea5d..dc8d666d6cb3a5 100644 --- a/caffe2/core/tensor_impl.cc +++ b/caffe2/core/tensor_impl.cc @@ -1,5 +1,4 @@ #include "caffe2/core/tensor_impl.h" - #include "caffe2/core/flags.h" CAFFE2_DEFINE_bool( diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index f50a4f34c66789..087078c507d164 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -20,6 +20,8 @@ class IDEEPContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP); } + explicit IDEEPContext(const at::Device& device) + : IDEEPContext(DeviceToOption(device)) {} ~IDEEPContext() noexcept override {} @@ -178,15 +180,6 @@ class IDEEPStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return IDEEP; } diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 9fe3108c032957..a0b80f8a8e401c 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -4,6 +4,9 @@ #include #include "ideep_context.h" +namespace at { +REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext); +} // namespace at namespace caffe2 { CAFFE_KNOWN_TYPE(ideep::tensor); diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc index 6e9075df43475f..8c66bc111282ac 100644 --- a/caffe2/mkl/utils/mkl_context.cc +++ b/caffe2/mkl/utils/mkl_context.cc @@ -3,6 +3,10 @@ #include "mkl_context.h" #include "caffe2/core/event_cpu.h" +namespace at { + +REGISTER_CONTEXT(DeviceType::MKLDNN, caffe2::MKLContext); +} // namespace at namespace caffe2 { // MKL events are the same as CPU events diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 0a7b5808a446be..8364026d91c651 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -29,6 +29,8 @@ class MKLContext : public 
BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_MKLDNN); } + explicit MKLContext(const at::Device& device) + : MKLContext(DeviceToOption(device)) {} ~MKLContext() override {} @@ -155,15 +157,6 @@ class MKLStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return MKLDNN; } diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h index 0a08c8db241e98..e0eb8e8dcdcdf3 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -47,6 +47,10 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) { } } +inline CAFFE2_API DeviceType ProtoToType(int p) { + return ProtoToType(static_cast(p)); +} + inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { switch (t) { case DeviceType::CPU: @@ -77,4 +81,17 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { } } +inline CAFFE2_API caffe2::DeviceOption DeviceToOption( + const at::Device& device) { + caffe2::DeviceOption option; + auto type = device.type(); + option.set_device_type(TypeToProto(type)); + option.set_device_id(device.index()); + return option; +} + +inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) { + return at::Device(ProtoToType(option.device_type()), option.device_id()); +} + } // namespace caffe2 diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index d18d728f282afa..dcb416b07a8fea 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -148,7 +148,7 @@ class TensorFetcher : public BlobFetcherBase { } if (result.copied) { - auto context = tensor.GetStaticContext()->CreateContext(); + auto context = CreateContext(tensor.GetDeviceType()); context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); context->FinishDeviceComputation(); } From e43ffb014806757ad91fc70c9ec5c5f44a4eaa2e Mon Sep 17 00:00:00 2001 From: Duc Ngo Date: Mon, 1 Oct 2018 11:10:15 -0700 Subject: [PATCH 65/82] nomnigraph - easy - some code cleanup for transformations_test (#12101) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12101 clean up some duplicate test code Reviewed By: ZolotukhinM Differential Revision: D10051914 fbshipit-source-id: 698ff144a85e8c70572116c5ddb415cd2396b4e3 --- caffe2/python/test_util.py | 4 + caffe2/python/transformations_test.py | 126 ++++++++------------------ 2 files changed, 43 insertions(+), 87 deletions(-) diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index dc1f7370132230..fd4b3ab030428d 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -30,6 +30,10 @@ def randBlobsFloat32(names, *dims, **kwargs): randBlobFloat32(name, *dims, **kwargs) +def numOps(net): + return len(net.Proto().op) + + def str_compare(a, b, encoding="utf8"): if isinstance(a, bytes): a = a.decode(encoding) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 26f5450605a1c1..502c844404c567 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -30,134 +30,86 @@ class TestTransformations(tu.TestCase): - def test_transformer_AddNNPACK(self): + def _base_test_net(self): net = 
core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) + return net + + def _add_nnpack(self, net): transformer.AddNNPACK(net) assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - def test_transformer_FuseNNPACKConvRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") + def _fuse_nnpack_convrelu(self, net, expected_result_num_ops, + expected_activation_arg=True): + self._add_nnpack(net) transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 + self.assertEquals(tu.numOps(net), expected_result_num_ops) has_activation_arg = False for arg in net.Proto().op[0].arg: if tu.str_compare(arg.name, "activation"): assert tu.str_compare(arg.s, "Relu") has_activation_arg = True - assert has_activation_arg + if expected_activation_arg: + assert has_activation_arg + else: + assert not has_activation_arg + + def test_transformer_AddNNPACK(self): + net = self._base_test_net() + net.Relu(["Y"], ["Y2"]) + self._add_nnpack(net) + + def test_transformer_FuseNNPACKConvRelu(self): + net = self._base_test_net() + net.Relu(["Y"], ["Y2"]) + self._fuse_nnpack_convrelu(net, 1) def test_noFuseNNPACKConvRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y2"]) net.Relu(["Y"], ["Y3"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 3 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation") and tu.str_compare(arg.s, "Relu"): - has_activation_arg = True - assert not has_activation_arg + self._fuse_nnpack_convrelu(net, 3, expected_activation_arg=False) def test_transformer_FuseNNPACKConvReluNoInplace(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["X"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 1) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] def test_transformer_FuseNNPACKConvReluInplaceRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 1) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] def test_transformer_FuseNNPACKConvReluPingPongNaming(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, 
pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["X"]) net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y2"]) net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y"]) net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y2"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_SinkMaxPool(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.MaxPool(["Y"], ["Y1"], kernel=3) net.Relu(["Y1"], ["Y1"]) transformer.SinkMaxPool(net) @@ -205,7 +157,7 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference @@ -256,7 +208,7 @@ def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, orde transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference @@ -307,7 +259,7 @@ def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channe transformer.FuseConvBN(net) # 
Ensure fusion
-        assert len(net.Proto().op) == 1
+        assert tu.numOps(net) == 1
         workspace.RunNetOnce(net)
         postTransformOutput = workspace.FetchBlob("Y2").flatten()
         print("pre")
@@ -365,7 +317,7 @@ def test_transformer_FuseConv3DBN(
         transformer.FuseConvBN(net)

         # Ensure fusion
-        assert len(net.Proto().op) == 1
+        assert tu.numOps(net) == 1
         workspace.RunNetOnce(net)
         postTransformOutput = workspace.FetchBlob("Y2").flatten()
         # Check that there is no numerical difference

From 7d7d336c451e28eb00e3efbd640e29c73767a40b Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Mon, 1 Oct 2018 11:13:44 -0700
Subject: [PATCH 66/82] Back out "codemod cuda_gpu_id to device_id"

Summary:
Original commit changeset: f5614a5d2607

D9986213 is causing a [huge performance
difference](https://our.intern.facebook.com/intern/ads/analyze_canary/412951953278781781/)
for Multifeed Aggregator and has been blocking the aggregator push since last
Friday night: https://fburl.com/feedtools/b6izvwjz

We need to land this revert ASAP to unblock the aggregator push.

Reviewed By: orionr

Differential Revision: D10123245

fbshipit-source-id: d83da8e00a1250f5d09811a0a587c127e377aab2
---
 caffe2/contrib/nccl/cuda_nccl_op_gpu.cc       |  4 ++--
 caffe2/contrib/nccl/nccl_ops_test.py          |  2 +-
 caffe2/contrib/prof/prof_dag_net.cc           |  4 +-
 .../tensorboard/tensorboard_exporter.py       |  2 +-
 caffe2/contrib/warpctc/ctc_ops_test.py        |  8 +-
 caffe2/core/blob_gpu_test.cc                  |  4 +-
 caffe2/core/context_gpu.cu                    |  2 +-
 caffe2/core/context_gpu.h                     |  6 +-
 caffe2/core/cudnn_wrappers.h                  |  6 +-
 caffe2/core/event_gpu.cc                      | 16 ++--
 caffe2/core/hip/event_hip.cc                  |  2 +-
 caffe2/core/memonger.cc                       |  4 +-
 caffe2/core/net_async_base.cc                 |  4 +-
 caffe2/core/net_async_dag_gpu.cc              |  2 +-
 caffe2/core/net_gpu_test.cc                   |  2 +-
 caffe2/core/operator.cc                       |  2 +-
 caffe2/mkl/utils/mkl_memory.cc                |  2 +-
 caffe2/observers/profile_observer_gpu.cc      |  4 +-
 caffe2/onnx/backend.cc                        |  2 +-
 caffe2/operators/load_save_op_gpu.cc          |  2 +-
 .../rnn/recurrent_network_executor_gpu.cc     |  4 +-
 caffe2/proto/caffe2.proto                     |  2 +-
 caffe2/python/cnn.py                          |  2 +-
 caffe2/python/core.py                         | 16 ++--
 caffe2/python/core_test.py                    | 82 +++++++++----------
 caffe2/python/data_parallel_model.py          |  6 +-
 caffe2/python/hypothesis_test_util.py         |  2 +-
 caffe2/python/model_helper.py                 |  4 +-
 caffe2/python/muji.py                         |  2 +-
 caffe2/python/net_printer.py                  |  4 +-
 caffe2/python/numa_test.py                    |  2 +-
 caffe2/python/onnx/backend_rep.py             |  2 +-
 caffe2/python/operator_test/load_save_test.py |  2 +-
 caffe2/python/operator_test/rnn_cell_test.py  |  2 +-
 caffe2/python/optimizer.py                    | 10 +-
 .../predictor/predictor_exporter_test.py      |  2 +-
 caffe2/python/pybind_state_dlpack.h           |  4 +-
 caffe2/utils/proto_utils.cc                   |  4 +-
 caffe2/utils/proto_utils_test.cc              |  4 +-
 .../pyHIPIFY/cuda_to_hip_mappings.py          |  2 +-
 40 files changed, 119 insertions(+), 119 deletions(-)

diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
index 4c5313ff4b3032..ea8b3494c6a036 100644
--- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
+++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc
@@ -11,7 +11,7 @@ nccl::NCCLExecution getNCCLElements(
   // We either do an N-N op, or an N-1 op.
CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1); nccl::NCCLExecution ex; - ex.stream_gpu_id = context.device_id(); + ex.stream_gpu_id = context.cuda_gpu_id(); ex.stream = context.cuda_stream(); ex.root = op->template GetSingleArgument("root", 0); ex.elements.resize(op->InputSize()); @@ -204,7 +204,7 @@ std::pair, std::vector> ncclOpDevInfer( for (int i = 0; i < def.input().size(); ++i) { DeviceOption dev; dev.set_device_type(1); - dev.set_device_id(i); + dev.set_cuda_gpu_id(i); opt.push_back(dev); } return std::make_pair(opt, opt); diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index f6c22a7d750127..7e8a61e9de241d 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -21,7 +21,7 @@ def gpu_device(i): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = i + device_option.cuda_gpu_id = i return device_option diff --git a/caffe2/contrib/prof/prof_dag_net.cc b/caffe2/contrib/prof/prof_dag_net.cc index c8678652c3138f..16917ddc154fc9 100644 --- a/caffe2/contrib/prof/prof_dag_net.cc +++ b/caffe2/contrib/prof/prof_dag_net.cc @@ -33,9 +33,9 @@ void ProfDAGNet::ValidateOpTensorDevices() { had_mismatches = true; LOG(INFO) << "== PERFORMANCE WARNING == \n" << " Operator " << node.operator_->debug_def().type() - << " expects GPU " << mismatch.second.first.device_id() + << " expects GPU " << mismatch.second.first.cuda_gpu_id() << " but tensor [" << mismatch.first << "] is on GPU " - << mismatch.second.second.device_id(); + << mismatch.second.second.cuda_gpu_id(); } } if (!had_mismatches) { diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index cc2c3d85c96877..93ade48e7d267d 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -177,7 +177,7 @@ def _tf_device(device_option): if device_option.device_type == caffe2_pb2.CPU: return "/cpu:*" if device_option.device_type == caffe2_pb2.CUDA: - return "/gpu:{}".format(device_option.device_id) + return "/gpu:{}".format(device_option.cuda_gpu_id) raise Exception("Unhandled device", device_option) diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 3b21c8b667473c..25bb0a39e3a965 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -79,11 +79,11 @@ def test_ctc_cost_cpu(self): def test_ctc_cost_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=False) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=False, skip_input_lengths=True) @@ -99,10 +99,10 @@ def test_ctc_forward_only_cpu(self): def test_ctc_forward_only_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=True) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=True, skip_input_lengths=True) diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 8b4127e403a452..55eafdede7269a 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -195,7 +195,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { } EXPECT_TRUE(tensor_proto.has_device_detail()); 
EXPECT_EQ(tensor_proto.device_detail().device_type(), PROTO_CUDA); - EXPECT_EQ(tensor_proto.device_detail().device_id(), gpu_id); + EXPECT_EQ(tensor_proto.device_detail().cuda_gpu_id(), gpu_id); // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); @@ -205,7 +205,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if we force the restored blob on a different device, we // can still get so. blob.Reset(); - proto.mutable_tensor()->mutable_device_detail()->set_device_id(0); + proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index f10fe067ac746c..0d9e2686212a1e 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -256,7 +256,7 @@ CUDAContext::CUDAContext(const int gpu_id) CUDAContext::CUDAContext(const DeviceOption& option) : gpu_id_( - option.has_device_id() ? RectifyGPUID(option.device_id()) + option.has_cuda_gpu_id() ? RectifyGPUID(option.cuda_gpu_id()) : CaffeCudaGetDevice()), random_seed_( option.has_random_seed() ? option.random_seed() diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 65ba4a006a94af..ce73f5f942828b 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -184,7 +184,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } } - inline int device_id() const { + inline int cuda_gpu_id() const { return gpu_id_; } @@ -283,7 +283,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } static bool IsStreamFree(const DeviceOption& option, int stream_id) { - auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id); + auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id); return cudaStreamQuery(stream) == cudaSuccess; } @@ -393,7 +393,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* data) override { device->set_device_type(TypeToProto(GetDeviceType())); - device->set_device_id(GetGPUIDForPointer(data)); + device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } protected: diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index dea138e9ad507c..1bd39fa62a399f 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -122,9 +122,9 @@ class CuDNNWrapper { void with_cudnn_state(size_t state_idx, F&& f) { CAFFE_ENFORCE( state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx"); - auto& sync_state = cudnn_states()[context_->device_id()][state_idx]; + auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx]; - DeviceGuard dg(context_->device_id()); + DeviceGuard dg(context_->cuda_gpu_id()); // We need to serialize execution on the CuDNNState as we can't // allow multiple threads to race through the cudaEventRecord @@ -132,7 +132,7 @@ class CuDNNWrapper { // execution) std::lock_guard g(sync_state.mutex); if (!sync_state.state.get()) { - sync_state.state.reset(new CuDNNState(context_->device_id())); + sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id())); } CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f); } diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc index 44aec8d3f2b8f4..6253ca19c9ab70 100644 --- a/caffe2/core/event_gpu.cc +++ 
b/caffe2/core/event_gpu.cc @@ -9,21 +9,21 @@ namespace caffe2 { struct CudaEventWrapper { explicit CudaEventWrapper(const DeviceOption& option) : cuda_stream_(nullptr), - device_id_(option.device_id()), + cuda_gpu_id_(option.cuda_gpu_id()), status_(EventStatus::EVENT_INITIALIZED) { CAFFE_ENFORCE(option.device_type(), PROTO_CUDA); - DeviceGuard g(device_id_); + DeviceGuard g(cuda_gpu_id_); CUDA_ENFORCE(cudaEventCreate( &cuda_event_, cudaEventDefault | cudaEventDisableTiming)); } ~CudaEventWrapper() { - DeviceGuard g(device_id_); + DeviceGuard g(cuda_gpu_id_); CUDA_CHECK(cudaEventDestroy(cuda_event_)); } cudaEvent_t cuda_event_; cudaStream_t cuda_stream_; - int device_id_; + int cuda_gpu_id_; std::atomic status_; std::mutex mutex_recorded_; @@ -65,12 +65,12 @@ void EventRecordCUDA(Event* event, const void* context, const char* err_msg) { const auto& current_device = CaffeCudaGetDevice(); CAFFE_ENFORCE_EQ( current_device, - wrapper->device_id_, + wrapper->cuda_gpu_id_, "When you call EventRecordCUDA, your current device should be the same " "as the device specified by the event."); CAFFE_ENFORCE_EQ( current_device, - static_cast(context)->device_id()); + static_cast(context)->cuda_gpu_id()); CUDA_ENFORCE(cudaEventRecord( wrapper->cuda_event_, static_cast(context)->cuda_stream())); @@ -96,7 +96,7 @@ void EventFinishCUDA(const Event* event) { if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) { // ok, even if event is already completed and status was not yet updated - DeviceGuard g(wrapper->device_id_); + DeviceGuard g(wrapper->cuda_gpu_id_); auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_); if (cudaResult == cudaSuccess) { wrapper->status_ = EventStatus::EVENT_SUCCESS; @@ -127,7 +127,7 @@ void EventWaitCUDACUDA(const Event* event, void* context) { if (context_stream != event_stream) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->device_id()); + // static_cast(context)->cuda_gpu_id()); CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0)); } } diff --git a/caffe2/core/hip/event_hip.cc b/caffe2/core/hip/event_hip.cc index ebec9c593e6eee..6f0db4642ddbba 100644 --- a/caffe2/core/hip/event_hip.cc +++ b/caffe2/core/hip/event_hip.cc @@ -138,7 +138,7 @@ void EventWaitHIPHIP(const Event* event, void* context) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->device_id()); + // static_cast(context)->cuda_gpu_id()); HIP_CHECK(hipStreamWaitEvent(context_stream, wrapper->hip_event_, 0)); } } diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index 87633fadebe34e..d9816e787ba88c 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -176,7 +176,7 @@ class ComputeBlobRecyclingForDag { // cuda device option but whose inputs/outputs are on CPU if (net.op(op_index).type() == "CopyGPUToCPU") { blob_device_[output].set_device_type(0); - blob_device_[output].set_device_id(0); + blob_device_[output].set_cuda_gpu_id(0); } } } @@ -478,7 +478,7 @@ class ComputeBlobRecyclingForDag { const DeviceOption& device_option) { const DeviceOption& blob_device = blob_device_[blob_name]; if (device_option.device_type() != blob_device.device_type() || - device_option.device_id() != blob_device.device_id()) { + device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) { return false; } for (const int token : req_tokens_[blob_name]) { diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index a694a4865c6cb3..ce5fdbe7b7ed80 100644 --- a/caffe2/core/net_async_base.cc +++ 
b/caffe2/core/net_async_base.cc @@ -157,7 +157,7 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { numa_node_id); return pool_getter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { - auto gpu_id = device_option.device_id(); + auto gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); @@ -173,7 +173,7 @@ int AsyncNetBase::stream(int task_id) { const auto& device_option = event(task_id).GetDeviceOption(); int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.device_id(); + int gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= getStreamCounters().size()) { getStreamCounters().resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 86d0b4d1d271dc..550a760826edd8 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -112,7 +112,7 @@ AsyncDAGNet::AsyncDAGNet( int AsyncDAGNet::stream(const DeviceOption& device_option) { int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.device_id(); + int gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_gpu_test.cc b/caffe2/core/net_gpu_test.cc index fab56112ec227c..eaea9377f9bcac 100644 --- a/caffe2/core/net_gpu_test.cc +++ b/caffe2/core/net_gpu_test.cc @@ -124,7 +124,7 @@ TEST(NetTest, DISABLED_ChainingForDifferentDevices) { type: "NetTestDummy" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } )DOC"; diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 8115ae3aab6a3c..79be08c03b2325 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -649,7 +649,7 @@ std::map> ValidateTensorDevices( &blob_device); if (blob_device.device_type() == PROTO_CUDA && - blob_device.device_id() != op_device.device_id()) { + blob_device.cuda_gpu_id() != op_device.cuda_gpu_id()) { mismatches[blob_name] = std::make_pair(op_device, blob_device); } else if ( blob_device.device_type() == PROTO_HIP && diff --git a/caffe2/mkl/utils/mkl_memory.cc b/caffe2/mkl/utils/mkl_memory.cc index 9d4f347a13cb81..3f05f9c5d24bde 100644 --- a/caffe2/mkl/utils/mkl_memory.cc +++ b/caffe2/mkl/utils/mkl_memory.cc @@ -26,7 +26,7 @@ static vector GetMKLTensorInfo( const mkl::MKLMemory* tc = static_cast*>(c); *capacity = tc->size() * sizeof(T); device->set_device_type(PROTO_MKLDNN); - device->set_device_id(0); + device->set_cuda_gpu_id(0); return tc->dims(); } diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index 5bd9b0a11b0921..bf4e20b7904711 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -70,7 +70,7 @@ void ProfileOperatorObserver::Start() { int device; cudaGetDevice(&device); - cudaSetDevice(context->device_id()); + cudaSetDevice(context->cuda_gpu_id()); cudaEventCreate(&start_); cudaEventRecord(start_, context->cuda_stream()); @@ -92,7 +92,7 @@ void ProfileOperatorObserver::Stop() { int device; cudaGetDevice(&device); - cudaSetDevice(context->device_id()); + cudaSetDevice(context->cuda_gpu_id()); cudaEventCreate(&stop_); 
cudaEventRecord(stop_, context->cuda_stream()); cudaEventSynchronize(stop_); diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 8a21fa0acf679c..2350910febff27 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -65,7 +65,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) { {DeviceType::CUDA, caffe2::DeviceType::CUDA}}; caffe2::DeviceOption d; d.set_device_type(static_cast(m.at(onnx_device.type))); - d.set_device_id(onnx_device.device_id); + d.set_cuda_gpu_id(onnx_device.device_id); return d; } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index 8458fab901ed8b..cd70e9c2b5df2f 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -8,7 +8,7 @@ void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); - device_detail->set_device_id(CaffeCudaGetDevice()); + device_detail->set_cuda_gpu_id(CaffeCudaGetDevice()); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc index 061f54d3a4cb0e..e16e2073f7fd12 100644 --- a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc +++ b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc @@ -72,11 +72,11 @@ void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) { if (gpu_id == -1 && rnn_op.op->device_option().device_type() == DeviceTypeProto::PROTO_CUDA) { - gpu_id = rnn_op.op->device_option().device_id(); + gpu_id = rnn_op.op->device_option().cuda_gpu_id(); } else { CAFFE_ENFORCE( rnn_op.op->device_option().device_type() == 0 || - rnn_op.op->device_option().device_id() == gpu_id, + rnn_op.op->device_option().cuda_gpu_id() == gpu_id, "RNN Executor only supports ops on one GPU"); } diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 21bdec2c6883b1..71870010293492 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -135,7 +135,7 @@ message DeviceOption { // optional DeviceType device_type = 1 [ default = CPU ]; optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU. // [CUDA specific] the cuda gpu id. - optional int32 device_id = 2; + optional int32 cuda_gpu_id = 2; // [general] The random seed to start the device random number generator with. optional uint32 random_seed = 3; // [general] What node this op should execute on. 
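The one-line proto change above is the crux of this revert: field 2 of
DeviceOption is once again named cuda_gpu_id, and essentially every other
hunk in this patch is the mechanical rename back at the use sites. A minimal
sketch of addressing a GPU through the restored field, following the same
pattern as the cnn.py and core.py hunks below:

    from caffe2.proto import caffe2_pb2

    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = 1  # CUDA device index, field 2 of DeviceOption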
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index f9ccf92d75099b..f927020e6ae88f 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -236,5 +236,5 @@ def CPU(self): def GPU(self, gpu_id=0): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = gpu_id + device_option.cuda_gpu_id = gpu_id return device_option diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 4f683daa368240..6850c02fc13964 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -84,7 +84,7 @@ def IsOperatorWithEngine(op_type, engine): def DeviceOption( device_type, - device_id=0, + cuda_gpu_id=0, random_seed=None, node_name=None, numa_node_id=None, @@ -92,7 +92,7 @@ def DeviceOption( ): option = caffe2_pb2.DeviceOption() option.device_type = device_type - option.device_id = device_id + option.cuda_gpu_id = cuda_gpu_id if node_name is not None: option.node_name = node_name if random_seed is not None: @@ -115,7 +115,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr if not opt1.device_type or not opt2.device_type: # At least one option is for CPU, check if both are for CPU. return not opt1.device_type and not opt2.device_type - return opt1.device_id == opt2.device_id + return opt1.cuda_gpu_id == opt2.cuda_gpu_id def InferBlobDevices(net): @@ -2111,7 +2111,7 @@ def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): """A convenient function to run everything on the GPU.""" device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = gpu_id + device_option.cuda_gpu_id = gpu_id self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: @@ -2286,7 +2286,7 @@ def copy_func_between_devices(src, dst): return None if src.device_type == CUDA and dst.device_type == CUDA: - if src.device_id == dst.device_id: + if src.cuda_gpu_id == dst.cuda_gpu_id: return None else: def fun(net, *args, **kw): @@ -2312,10 +2312,10 @@ def fun(net, *args, **kw): def device_equal(src, dst): ''' We are using this fucntion instead of == operator because optional-value - comparison between empty device_options and {device_type:0, device_id:0} + comparison between empty device_options and {device_type:0, cuda_gpu_id:0} returns not equal in some cases. ''' - return src.device_type == dst.device_type and src.device_id == dst.device_id + return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id def update_placeholder_op_output(op, blob_to_device): @@ -2429,7 +2429,7 @@ def _gen_new_name(blob, device_option): if device_option.device_type == CPU: suffix = '_cpu' elif device_option.device_type == CUDA: - suffix = '_cuda_' + str(device_option.device_id) + suffix = '_cuda_' + str(device_option.cuda_gpu_id) else: raise RuntimeError( "Unknown device type: {}". 
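Putting the restored core.py helpers together: DeviceOption construction,
DeviceScope inheritance, and the device_option_equal comparison compose as in
the minimal sketch below (along the lines of the core_test.py cases that
follow):

    from caffe2.proto import caffe2_pb2
    from caffe2.python import core

    gpu1 = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)

    # Ops created inside a DeviceScope inherit the scope's device option.
    with core.DeviceScope(gpu1):
        op = core.CreateOperator("Relu", "x", "y")
    assert op.device_option.cuda_gpu_id == 1

    # Two CPU options compare equal even if cuda_gpu_id differs, because
    # the field is meaningless when device_type is CPU.
    cpu = core.DeviceOption(caffe2_pb2.CPU, cuda_gpu_id=5)
    assert core.device_option_equal(cpu, caffe2_pb2.DeviceOption())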
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 2f6dedbfd80c83..7120843f33152d 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -83,17 +83,17 @@ def testDeviceScope(self): # explicitly setting a device device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 op = core.CreateOperator("Relu", "x", "y", device_option=device_option) self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) with core.DeviceScope(device_option): # from device scope op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) # from an overridden device option override_device = caffe2_pb2.DeviceOption() override_device.device_type = caffe2_pb2.CPU @@ -109,13 +109,13 @@ def testDeviceScope(self): def testNameAndDeviceScopeTogether(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): with core.NameScope("foo"): op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(len(op.input), 1) self.assertEqual(op.input[0], "foo/x") self.assertEqual(len(op.output), 1) @@ -255,7 +255,7 @@ class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 op = core.CreateOperator( "Ludicrous", "x", "y", name="ludicrous", control_input="z", device_option=device_option, @@ -271,7 +271,7 @@ def testCreate(self): self.assertEqual(op.control_input[0], "z") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertTrue(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args @@ -574,7 +574,7 @@ def test_check_equal_default_value(self): opt2 = caffe2_pb2.DeviceOption() opt1.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.device_id = 5 + opt1.cuda_gpu_id = 5 # opt1 still is on CPU, so the options should be equal self.assertTrue(core.device_option_equal(opt1, opt2)) opt2.device_type = 0 @@ -649,7 +649,7 @@ class TestInferDevice(test_util.TestCase): def setUp(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 self.cuda_option = device_option self.cpu_option = caffe2_pb2.DeviceOption() @@ -748,7 +748,7 @@ def test_inject_copy(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) @@ -765,7 +765,7 @@ def 
test_inject_copy(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") self.assertNotEqual(blob_to_device["fc_w"], device_option) @@ -775,7 +775,7 @@ def test_cross_nets(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) const = init_net.ConstantFill([], 'const', shape=[], value=1.) @@ -791,12 +791,12 @@ def test_cross_nets(self): op = nets[1]._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "fc_w_cuda_1") op = nets[1]._net.op[1] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "fc_b_cuda_1") op = nets[1]._net.op[2] self.assertEqual(op.type, "FC") @@ -804,7 +804,7 @@ def test_cross_nets(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) op = nets[1]._net.op[3] self.assertEqual(op.type, "Add") self.assertEqual(op.input[0], "fc1") @@ -822,7 +822,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -832,7 +832,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -844,7 +844,7 @@ def test_cross_nets(self): type: "FC" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -855,7 +855,7 @@ def test_cross_nets(self): type: "Add" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } external_input: "data" @@ -870,7 +870,7 @@ def test_cross_nets_no_change(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) @@ -887,7 +887,7 @@ def test_cross_nets_no_change(self): self.assertEqual(op.input[1], "fc_w") self.assertEqual(op.input[2], "fc_b") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) """ For reference, net.Proto() should be like: name: "" @@ -900,7 +900,7 @@ def test_cross_nets_no_change(self): type: "FC" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } external_input: "data" @@ -912,7 +912,7 @@ def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu1") @@ -920,10 
+920,10 @@ def test_inject_copy_multi_use(self): with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") - device_option.device_id = 0 + device_option.cuda_gpu_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") @@ -931,12 +931,12 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") @@ -945,7 +945,7 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[3] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] @@ -955,18 +955,18 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """ @@ -979,7 +979,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -989,7 +989,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -1005,7 +1005,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -1021,7 +1021,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 0 + cuda_gpu_id: 0 } } op { @@ -1031,7 +1031,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 0 + cuda_gpu_id: 0 } } op { @@ -1041,7 +1041,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } external_input: "data" @@ -1060,7 +1060,7 @@ def test_inject_copy_placeholder_ops(self): cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) gpu_device[i].device_type = caffe2_pb2.CUDA - gpu_device[i].device_id = 0 + gpu_device[i].cuda_gpu_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 
'node:1' @@ -1100,12 +1100,12 @@ def test_inject_copy_placeholder_ops(self): op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) @@ -1128,7 +1128,7 @@ def test_blob_inplace(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) with core.DeviceScope(device_option): diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 749c8b12c930e8..89770dc6ea7d9a 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -813,7 +813,7 @@ def builder_fun(model): device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" - namescope = "{}_{}/".format(device_prefix, device.device_id) + namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -1540,7 +1540,7 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.device_id + op_gpu = op_dev.cuda_gpu_id # This avoids failing on operators that are only for CPU if op_dev.device_type != caffe2_pb2.CUDA: @@ -1904,7 +1904,7 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.device_id].append(op) + ops[op.device_option.cuda_gpu_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 8470df1588717f..5cc18f99bd9eb9 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -259,7 +259,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # Include device option for each GPU expanded_device_options = [cpu_do] + ( - [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, device_id=i) + [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) for i in range(workspace.NumCudaDevices())] if workspace.has_gpu_support else []) diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index 1e881d27f49dc8..f8e3f32bb2c225 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -596,7 +596,7 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not None: step_op.device_option.device_type = device.device_type - step_op.device_option.device_id = device.device_id + step_op.device_option.cuda_gpu_id = device.cuda_gpu_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -610,7 +610,7 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.device_id = device.device_id + op.device_option.cuda_gpu_id = device.cuda_gpu_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py 
b/caffe2/python/muji.py index 2f2b5aced6640e..b407f96d2391f8 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -26,7 +26,7 @@ def OnGPU(gpu_id): """ device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = gpu_id + device_option.cuda_gpu_id = gpu_id return device_option diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 7583f863b1f5ad..4b5cddb61d244e 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -268,11 +268,11 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): if not dev_opt or not ( - dev_opt.device_type or dev_opt.device_id or dev_opt.node_name): + dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.device_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 3178345cf46e21..8d3a362dcdf725 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -27,7 +27,7 @@ def build_test_net(net_name): gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = caffe2_pb2.CUDA - gpu_device_option.device_id = 0 + gpu_device_option.cuda_gpu_id = 0 net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu", device_option=gpu_device_option) diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 5802e49de526dc..8cc3f9e2fa98eb 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -24,7 +24,7 @@ def __init__(self, init_net, predict_net, workspace, uninitialized): @property def _name_scope(self): if self.predict_net.device_option.device_type == caffe2_pb2.CUDA: - return 'gpu_{}'.format(self.predict_net.device_option.device_id) + return 'gpu_{}'.format(self.predict_net.device_option.cuda_gpu_id) return '' def run(self, inputs, **kwargs): diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index b90a7f84b4ed8a..07f378beb18ff0 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -89,7 +89,7 @@ def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll): self.assertEqual(proto.tensor.device_detail.device_type, device_type) if device_type == caffe2_pb2.CUDA: - self.assertEqual(proto.tensor.device_detail.device_id, + self.assertEqual(proto.tensor.device_detail.cuda_gpu_id, gpu_id) blobs = [str(i) for i in range(len(arrays))] diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 66ac07dbdca079..9d9bb38e178517 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1216,7 +1216,7 @@ def test_lstm_extract_predictor_net(self): if arg.name == "step_net": for step_op in arg.n.op: self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.device_id) + self.assertEqual(1, step_op.device_option.cuda_gpu_id) elif arg.name == 'backward_step_net': self.assertEqual(caffe2_pb2.NetDef(), arg.n) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index ddd5871f7d4b74..0c5b18b0b6ab11 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -83,7 +83,7 @@ def make_unique_blob_name(self, base_str): if 
current_scope.device_type == caffe2_pb2.CUDA: return self.get_gpu_blob_name( - base_str, current_scope.device_id, current_scope.node_name + base_str, current_scope.cuda_gpu_id, current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -279,7 +279,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), shape=[1], value=1.0 ) @@ -488,12 +488,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.device_id), + "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.device_id), + [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), shape=[1], value=self.weight_decay ) @@ -1160,7 +1160,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.device_id), + "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), shape=[1], value=1.0 ) diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index ef11246bdfcc9b..b4c71535debe66 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -193,7 +193,7 @@ def test_load_device_scope(self): # check device options for op in list(init_net.Proto().op) + list(predict_init_net.Proto().op): - self.assertEqual(1, op.device_option.device_id) + self.assertEqual(1, op.device_option.cuda_gpu_id) self.assertEqual(caffe2_pb2.CPU, op.device_option.device_type) def test_db_fails_without_params(self): diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 6db4ae42b84742..679152c788132e 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -34,7 +34,7 @@ class DLPackWrapper { "Unsupported device type: ", device_option.device_type()); tensor_context.device_type = *device_type_ptr; - tensor_context.device_id = device_option.device_id(); + tensor_context.device_id = device_option.cuda_gpu_id(); if (tensor->size() <= 0) { tensor->Resize(0); @@ -87,7 +87,7 @@ class DLPackWrapper { int dlpack_device_id = dlTensor->ctx.device_id; CAFFE_ENFORCE_EQ( dlpack_device_id, - device_option.device_id(), + device_option.cuda_gpu_id(), "Expected same device id for DLPack and C2 tensors"); std::vector dims; diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index dd80282238a80b..dc8e088eba97c5 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -30,7 +30,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { case PROTO_CPU: return option.numa_node_id(); case PROTO_CUDA: - return option.device_id(); + return option.cuda_gpu_id(); case PROTO_MKLDNN: return option.numa_node_id(); case PROTO_HIP: @@ -43,7 +43,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { return ( lhs.device_type() == rhs.device_type() && - lhs.device_id() == rhs.device_id() && + lhs.cuda_gpu_id() == rhs.cuda_gpu_id() && lhs.hip_gpu_id() == rhs.hip_gpu_id() && lhs.node_name() == rhs.node_name() && lhs.numa_node_id() == rhs.numa_node_id()); diff --git 
a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc index 5d8fb86b34e3bb..c9f37f4c98c290 100644 --- a/caffe2/utils/proto_utils_test.cc +++ b/caffe2/utils/proto_utils_test.cc @@ -11,9 +11,9 @@ TEST(ProtoUtilsTest, IsSameDevice) { EXPECT_FALSE(IsSameDevice(a, b)); b.set_node_name("my_node"); EXPECT_TRUE(IsSameDevice(a, b)); - b.set_device_id(2); + b.set_cuda_gpu_id(2); EXPECT_FALSE(IsSameDevice(a, b)); - a.set_device_id(2); + a.set_cuda_gpu_id(2); EXPECT_TRUE(IsSameDevice(a, b)); a.set_device_type(DeviceTypeProto::PROTO_CUDA); b.set_device_type(DeviceTypeProto::PROTO_CPU); diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index 3a98a4cb7d9f3e..113403fd87bbf4 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -2216,7 +2216,7 @@ "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2), "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2), "curand_generator" : ("hiprand_generator", API_CAFFE2), - "device_id" : ("hip_gpu_id", API_CAFFE2), + "cuda_gpu_id" : ("hip_gpu_id", API_CAFFE2), "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2), } From ecb3835387e4f5b7f8bfacc1b4d14aeaf687aecd Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Oct 2018 11:17:44 -0700 Subject: [PATCH 67/82] change \gamma to \Gamma (#12214) Summary: - revert `\gamma` changes at landed PR: https://github.com/pytorch/pytorch/pull/12126 - minor fix for docs of `torch.norm()` SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/12214 Differential Revision: D10127337 Pulled By: weiyangfb fbshipit-source-id: 15eb8abda39ec9e8b2e815e2a22096cae786995a --- torch/_torch_docs.py | 6 +++--- torch/functional.py | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 44b963f38d3bb8..8f3c1ae6ebf73c 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1210,7 +1210,7 @@ def parse_kwargs(desc): Computes the logarithmic derivative of the gamma function on `input`. .. math:: - \psi(x) = \frac{d}{dx} \ln\left(\gamma\left(x\right)\right) = \frac{\gamma'(x)}{\gamma(x)} + \psi(x) = \frac{d}{dx} \ln\left(\Gamma\left(x\right)\right) = \frac{\Gamma'(x)}{\Gamma(x)} Args: input (Tensor): the tensor to compute the digamma function on @@ -2936,9 +2936,9 @@ def parse_kwargs(desc): Computes the multivariate log-gamma function with dimension :math:`p` element-wise, given by .. math:: - \log(\gamma_{p}(a)) = C + \displaystyle \sum_{i=1}^{p} \log\left(\gamma\left(a - \frac{i - 1}{2}\right)\right) + \log(\Gamma_{p}(a)) = C + \displaystyle \sum_{i=1}^{p} \log\left(\Gamma\left(a - \frac{i - 1}{2}\right)\right) -where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\gamma(.)` is the Gamma function. +where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\Gamma(\cdot)` is the Gamma function. If any of the elements are less than or equal to :math:`\frac{p - 1}{2}`, then an error is thrown. diff --git a/torch/functional.py b/torch/functional.py index 47625171c519e2..4290f78585a965 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -648,7 +648,7 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): Args: input (Tensor): the input tensor - p (int, float, inf, -inf, 'fro', 'nuc'): the order of norm + p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. 
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== @@ -667,13 +667,14 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. - keepdim (bool): whether the output tensors have :attr:`dim` + keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and - :attr:`out` = ``None``. + :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Example:: + >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) From 3010dc4208df1d2bd8058cfafeffe645f454595c Mon Sep 17 00:00:00 2001 From: Rick Ratmansky Date: Mon, 1 Oct 2018 12:09:39 -0700 Subject: [PATCH 68/82] Revert D10123245: Back out "codemod cuda_gpu_id to device_id" Differential Revision: D10123245 Original commit changeset: d83da8e00a12 fbshipit-source-id: fca91fea58b7df208edc2e218a1d514f9821ec7b --- caffe2/contrib/nccl/cuda_nccl_op_gpu.cc | 4 +- caffe2/contrib/nccl/nccl_ops_test.py | 2 +- caffe2/contrib/prof/prof_dag_net.cc | 4 +- .../tensorboard/tensorboard_exporter.py | 2 +- caffe2/contrib/warpctc/ctc_ops_test.py | 8 +- caffe2/core/blob_gpu_test.cc | 4 +- caffe2/core/context_gpu.cu | 2 +- caffe2/core/context_gpu.h | 6 +- caffe2/core/cudnn_wrappers.h | 6 +- caffe2/core/event_gpu.cc | 16 ++-- caffe2/core/hip/event_hip.cc | 2 +- caffe2/core/memonger.cc | 4 +- caffe2/core/net_async_base.cc | 4 +- caffe2/core/net_async_dag_gpu.cc | 2 +- caffe2/core/net_gpu_test.cc | 2 +- caffe2/core/operator.cc | 2 +- caffe2/mkl/utils/mkl_memory.cc | 2 +- caffe2/observers/profile_observer_gpu.cc | 4 +- caffe2/onnx/backend.cc | 2 +- caffe2/operators/load_save_op_gpu.cc | 2 +- .../rnn/recurrent_network_executor_gpu.cc | 4 +- caffe2/proto/caffe2.proto | 2 +- caffe2/python/cnn.py | 2 +- caffe2/python/core.py | 16 ++-- caffe2/python/core_test.py | 82 +++++++++---------- caffe2/python/data_parallel_model.py | 6 +- caffe2/python/hypothesis_test_util.py | 2 +- caffe2/python/model_helper.py | 4 +- caffe2/python/muji.py | 2 +- caffe2/python/net_printer.py | 4 +- caffe2/python/numa_test.py | 2 +- caffe2/python/onnx/backend_rep.py | 2 +- caffe2/python/operator_test/load_save_test.py | 2 +- caffe2/python/operator_test/rnn_cell_test.py | 2 +- caffe2/python/optimizer.py | 10 +-- .../predictor/predictor_exporter_test.py | 2 +- caffe2/python/pybind_state_dlpack.h | 4 +- caffe2/utils/proto_utils.cc | 4 +- caffe2/utils/proto_utils_test.cc | 4 +- .../pyHIPIFY/cuda_to_hip_mappings.py | 2 +- 40 files changed, 119 insertions(+), 119 deletions(-) diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc index ea8b3494c6a036..4c5313ff4b3032 100644 --- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc @@ -11,7 +11,7 @@ nccl::NCCLExecution getNCCLElements( // We either do an N-N op, or an N-1 op. 
CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1); nccl::NCCLExecution ex; - ex.stream_gpu_id = context.cuda_gpu_id(); + ex.stream_gpu_id = context.device_id(); ex.stream = context.cuda_stream(); ex.root = op->template GetSingleArgument("root", 0); ex.elements.resize(op->InputSize()); @@ -204,7 +204,7 @@ std::pair, std::vector> ncclOpDevInfer( for (int i = 0; i < def.input().size(); ++i) { DeviceOption dev; dev.set_device_type(1); - dev.set_cuda_gpu_id(i); + dev.set_device_id(i); opt.push_back(dev); } return std::make_pair(opt, opt); diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index 7e8a61e9de241d..f6c22a7d750127 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -21,7 +21,7 @@ def gpu_device(i): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = i + device_option.device_id = i return device_option diff --git a/caffe2/contrib/prof/prof_dag_net.cc b/caffe2/contrib/prof/prof_dag_net.cc index 16917ddc154fc9..c8678652c3138f 100644 --- a/caffe2/contrib/prof/prof_dag_net.cc +++ b/caffe2/contrib/prof/prof_dag_net.cc @@ -33,9 +33,9 @@ void ProfDAGNet::ValidateOpTensorDevices() { had_mismatches = true; LOG(INFO) << "== PERFORMANCE WARNING == \n" << " Operator " << node.operator_->debug_def().type() - << " expects GPU " << mismatch.second.first.cuda_gpu_id() + << " expects GPU " << mismatch.second.first.device_id() << " but tensor [" << mismatch.first << "] is on GPU " - << mismatch.second.second.cuda_gpu_id(); + << mismatch.second.second.device_id(); } } if (!had_mismatches) { diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index 93ade48e7d267d..cc2c3d85c96877 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -177,7 +177,7 @@ def _tf_device(device_option): if device_option.device_type == caffe2_pb2.CPU: return "/cpu:*" if device_option.device_type == caffe2_pb2.CUDA: - return "/gpu:{}".format(device_option.cuda_gpu_id) + return "/gpu:{}".format(device_option.device_id) raise Exception("Unhandled device", device_option) diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 25bb0a39e3a965..3b21c8b667473c 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -79,11 +79,11 @@ def test_ctc_cost_cpu(self): def test_ctc_cost_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=False) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=False, skip_input_lengths=True) @@ -99,10 +99,10 @@ def test_ctc_forward_only_cpu(self): def test_ctc_forward_only_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=True) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - cuda_gpu_id=0), + device_id=0), is_test=True, skip_input_lengths=True) diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 55eafdede7269a..8b4127e403a452 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -195,7 +195,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { } EXPECT_TRUE(tensor_proto.has_device_detail()); 
EXPECT_EQ(tensor_proto.device_detail().device_type(), PROTO_CUDA); - EXPECT_EQ(tensor_proto.device_detail().cuda_gpu_id(), gpu_id); + EXPECT_EQ(tensor_proto.device_detail().device_id(), gpu_id); // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); @@ -205,7 +205,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if we force the restored blob on a different device, we // can still get so. blob.Reset(); - proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); + proto.mutable_tensor()->mutable_device_detail()->set_device_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 0d9e2686212a1e..f10fe067ac746c 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -256,7 +256,7 @@ CUDAContext::CUDAContext(const int gpu_id) CUDAContext::CUDAContext(const DeviceOption& option) : gpu_id_( - option.has_cuda_gpu_id() ? RectifyGPUID(option.cuda_gpu_id()) + option.has_device_id() ? RectifyGPUID(option.device_id()) : CaffeCudaGetDevice()), random_seed_( option.has_random_seed() ? option.random_seed() diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index ce73f5f942828b..65ba4a006a94af 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -184,7 +184,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } } - inline int cuda_gpu_id() const { + inline int device_id() const { return gpu_id_; } @@ -283,7 +283,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } static bool IsStreamFree(const DeviceOption& option, int stream_id) { - auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id); + auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id); return cudaStreamQuery(stream) == cudaSuccess; } @@ -393,7 +393,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* data) override { device->set_device_type(TypeToProto(GetDeviceType())); - device->set_cuda_gpu_id(GetGPUIDForPointer(data)); + device->set_device_id(GetGPUIDForPointer(data)); } protected: diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index 1bd39fa62a399f..dea138e9ad507c 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -122,9 +122,9 @@ class CuDNNWrapper { void with_cudnn_state(size_t state_idx, F&& f) { CAFFE_ENFORCE( state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx"); - auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx]; + auto& sync_state = cudnn_states()[context_->device_id()][state_idx]; - DeviceGuard dg(context_->cuda_gpu_id()); + DeviceGuard dg(context_->device_id()); // We need to serialize execution on the CuDNNState as we can't // allow multiple threads to race through the cudaEventRecord @@ -132,7 +132,7 @@ class CuDNNWrapper { // execution) std::lock_guard g(sync_state.mutex); if (!sync_state.state.get()) { - sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id())); + sync_state.state.reset(new CuDNNState(context_->device_id())); } CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f); } diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc index 6253ca19c9ab70..44aec8d3f2b8f4 100644 --- a/caffe2/core/event_gpu.cc +++ 
b/caffe2/core/event_gpu.cc @@ -9,21 +9,21 @@ namespace caffe2 { struct CudaEventWrapper { explicit CudaEventWrapper(const DeviceOption& option) : cuda_stream_(nullptr), - cuda_gpu_id_(option.cuda_gpu_id()), + device_id_(option.device_id()), status_(EventStatus::EVENT_INITIALIZED) { CAFFE_ENFORCE(option.device_type(), PROTO_CUDA); - DeviceGuard g(cuda_gpu_id_); + DeviceGuard g(device_id_); CUDA_ENFORCE(cudaEventCreate( &cuda_event_, cudaEventDefault | cudaEventDisableTiming)); } ~CudaEventWrapper() { - DeviceGuard g(cuda_gpu_id_); + DeviceGuard g(device_id_); CUDA_CHECK(cudaEventDestroy(cuda_event_)); } cudaEvent_t cuda_event_; cudaStream_t cuda_stream_; - int cuda_gpu_id_; + int device_id_; std::atomic status_; std::mutex mutex_recorded_; @@ -65,12 +65,12 @@ void EventRecordCUDA(Event* event, const void* context, const char* err_msg) { const auto& current_device = CaffeCudaGetDevice(); CAFFE_ENFORCE_EQ( current_device, - wrapper->cuda_gpu_id_, + wrapper->device_id_, "When you call EventRecordCUDA, your current device should be the same " "as the device specified by the event."); CAFFE_ENFORCE_EQ( current_device, - static_cast(context)->cuda_gpu_id()); + static_cast(context)->device_id()); CUDA_ENFORCE(cudaEventRecord( wrapper->cuda_event_, static_cast(context)->cuda_stream())); @@ -96,7 +96,7 @@ void EventFinishCUDA(const Event* event) { if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) { // ok, even if event is already completed and status was not yet updated - DeviceGuard g(wrapper->cuda_gpu_id_); + DeviceGuard g(wrapper->device_id_); auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_); if (cudaResult == cudaSuccess) { wrapper->status_ = EventStatus::EVENT_SUCCESS; @@ -127,7 +127,7 @@ void EventWaitCUDACUDA(const Event* event, void* context) { if (context_stream != event_stream) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->cuda_gpu_id()); + // static_cast(context)->device_id()); CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0)); } } diff --git a/caffe2/core/hip/event_hip.cc b/caffe2/core/hip/event_hip.cc index 6f0db4642ddbba..ebec9c593e6eee 100644 --- a/caffe2/core/hip/event_hip.cc +++ b/caffe2/core/hip/event_hip.cc @@ -138,7 +138,7 @@ void EventWaitHIPHIP(const Event* event, void* context) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->cuda_gpu_id()); + // static_cast(context)->device_id()); HIP_CHECK(hipStreamWaitEvent(context_stream, wrapper->hip_event_, 0)); } } diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index d9816e787ba88c..87633fadebe34e 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -176,7 +176,7 @@ class ComputeBlobRecyclingForDag { // cuda device option but whose inputs/outputs are on CPU if (net.op(op_index).type() == "CopyGPUToCPU") { blob_device_[output].set_device_type(0); - blob_device_[output].set_cuda_gpu_id(0); + blob_device_[output].set_device_id(0); } } } @@ -478,7 +478,7 @@ class ComputeBlobRecyclingForDag { const DeviceOption& device_option) { const DeviceOption& blob_device = blob_device_[blob_name]; if (device_option.device_type() != blob_device.device_type() || - device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) { + device_option.device_id() != blob_device.device_id()) { return false; } for (const int token : req_tokens_[blob_name]) { diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index ce5fdbe7b7ed80..a694a4865c6cb3 100644 --- a/caffe2/core/net_async_base.cc +++ 
b/caffe2/core/net_async_base.cc @@ -157,7 +157,7 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { numa_node_id); return pool_getter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { - auto gpu_id = device_option.cuda_gpu_id(); + auto gpu_id = device_option.device_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); @@ -173,7 +173,7 @@ int AsyncNetBase::stream(int task_id) { const auto& device_option = event(task_id).GetDeviceOption(); int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.cuda_gpu_id(); + int gpu_id = device_option.device_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= getStreamCounters().size()) { getStreamCounters().resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 550a760826edd8..86d0b4d1d271dc 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -112,7 +112,7 @@ AsyncDAGNet::AsyncDAGNet( int AsyncDAGNet::stream(const DeviceOption& device_option) { int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.cuda_gpu_id(); + int gpu_id = device_option.device_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_gpu_test.cc b/caffe2/core/net_gpu_test.cc index eaea9377f9bcac..fab56112ec227c 100644 --- a/caffe2/core/net_gpu_test.cc +++ b/caffe2/core/net_gpu_test.cc @@ -124,7 +124,7 @@ TEST(NetTest, DISABLED_ChainingForDifferentDevices) { type: "NetTestDummy" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } )DOC"; diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 79be08c03b2325..8115ae3aab6a3c 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -649,7 +649,7 @@ std::map> ValidateTensorDevices( &blob_device); if (blob_device.device_type() == PROTO_CUDA && - blob_device.cuda_gpu_id() != op_device.cuda_gpu_id()) { + blob_device.device_id() != op_device.device_id()) { mismatches[blob_name] = std::make_pair(op_device, blob_device); } else if ( blob_device.device_type() == PROTO_HIP && diff --git a/caffe2/mkl/utils/mkl_memory.cc b/caffe2/mkl/utils/mkl_memory.cc index 3f05f9c5d24bde..9d4f347a13cb81 100644 --- a/caffe2/mkl/utils/mkl_memory.cc +++ b/caffe2/mkl/utils/mkl_memory.cc @@ -26,7 +26,7 @@ static vector GetMKLTensorInfo( const mkl::MKLMemory* tc = static_cast*>(c); *capacity = tc->size() * sizeof(T); device->set_device_type(PROTO_MKLDNN); - device->set_cuda_gpu_id(0); + device->set_device_id(0); return tc->dims(); } diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index bf4e20b7904711..5bd9b0a11b0921 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -70,7 +70,7 @@ void ProfileOperatorObserver::Start() { int device; cudaGetDevice(&device); - cudaSetDevice(context->cuda_gpu_id()); + cudaSetDevice(context->device_id()); cudaEventCreate(&start_); cudaEventRecord(start_, context->cuda_stream()); @@ -92,7 +92,7 @@ void ProfileOperatorObserver::Stop() { int device; cudaGetDevice(&device); - cudaSetDevice(context->cuda_gpu_id()); + cudaSetDevice(context->device_id()); cudaEventCreate(&stop_); 
cudaEventRecord(stop_, context->cuda_stream()); cudaEventSynchronize(stop_); diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 2350910febff27..8a21fa0acf679c 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -65,7 +65,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) { {DeviceType::CUDA, caffe2::DeviceType::CUDA}}; caffe2::DeviceOption d; d.set_device_type(static_cast(m.at(onnx_device.type))); - d.set_cuda_gpu_id(onnx_device.device_id); + d.set_device_id(onnx_device.device_id); return d; } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index cd70e9c2b5df2f..8458fab901ed8b 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -8,7 +8,7 @@ void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); - device_detail->set_cuda_gpu_id(CaffeCudaGetDevice()); + device_detail->set_device_id(CaffeCudaGetDevice()); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc index e16e2073f7fd12..061f54d3a4cb0e 100644 --- a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc +++ b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc @@ -72,11 +72,11 @@ void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) { if (gpu_id == -1 && rnn_op.op->device_option().device_type() == DeviceTypeProto::PROTO_CUDA) { - gpu_id = rnn_op.op->device_option().cuda_gpu_id(); + gpu_id = rnn_op.op->device_option().device_id(); } else { CAFFE_ENFORCE( rnn_op.op->device_option().device_type() == 0 || - rnn_op.op->device_option().cuda_gpu_id() == gpu_id, + rnn_op.op->device_option().device_id() == gpu_id, "RNN Executor only supports ops on one GPU"); } diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 71870010293492..21bdec2c6883b1 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -135,7 +135,7 @@ message DeviceOption { // optional DeviceType device_type = 1 [ default = CPU ]; optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU. // [CUDA specific] the cuda gpu id. - optional int32 cuda_gpu_id = 2; + optional int32 device_id = 2; // [general] The random seed to start the device random number generator with. optional uint32 random_seed = 3; // [general] What node this op should execute on. 
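For context, the renamed proto field above is what Python callers set when pinning an op to a GPU. A minimal sketch (illustrative only, not part of this patch; it assumes caffe2_pb2 bindings regenerated from the proto above, and mirrors the OnGPU()/DeviceOption usage touched throughout this series):

from caffe2.proto import caffe2_pb2

# Build a DeviceOption pinning work to CUDA device 0. With this patch
# applied, the GPU index is carried in `device_id` rather than the
# backed-out `cuda_gpu_id`.
device_option = caffe2_pb2.DeviceOption()
device_option.device_type = caffe2_pb2.CUDA
device_option.device_id = 0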
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index f927020e6ae88f..f9ccf92d75099b 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -236,5 +236,5 @@ def CPU(self): def GPU(self, gpu_id=0): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + device_option.device_id = gpu_id return device_option diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 6850c02fc13964..4f683daa368240 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -84,7 +84,7 @@ def IsOperatorWithEngine(op_type, engine): def DeviceOption( device_type, - cuda_gpu_id=0, + device_id=0, random_seed=None, node_name=None, numa_node_id=None, @@ -92,7 +92,7 @@ def DeviceOption( ): option = caffe2_pb2.DeviceOption() option.device_type = device_type - option.cuda_gpu_id = cuda_gpu_id + option.device_id = device_id if node_name is not None: option.node_name = node_name if random_seed is not None: @@ -115,7 +115,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr if not opt1.device_type or not opt2.device_type: # At least one option is for CPU, check if both are for CPU. return not opt1.device_type and not opt2.device_type - return opt1.cuda_gpu_id == opt2.cuda_gpu_id + return opt1.device_id == opt2.device_id def InferBlobDevices(net): @@ -2111,7 +2111,7 @@ def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): """A convenient function to run everything on the GPU.""" device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + device_option.device_id = gpu_id self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: @@ -2286,7 +2286,7 @@ def copy_func_between_devices(src, dst): return None if src.device_type == CUDA and dst.device_type == CUDA: - if src.cuda_gpu_id == dst.cuda_gpu_id: + if src.device_id == dst.device_id: return None else: def fun(net, *args, **kw): @@ -2312,10 +2312,10 @@ def fun(net, *args, **kw): def device_equal(src, dst): ''' We are using this fucntion instead of == operator because optional-value - comparison between empty device_options and {device_type:0, cuda_gpu_id:0} + comparison between empty device_options and {device_type:0, device_id:0} returns not equal in some cases. ''' - return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id + return src.device_type == dst.device_type and src.device_id == dst.device_id def update_placeholder_op_output(op, blob_to_device): @@ -2429,7 +2429,7 @@ def _gen_new_name(blob, device_option): if device_option.device_type == CPU: suffix = '_cpu' elif device_option.device_type == CUDA: - suffix = '_cuda_' + str(device_option.cuda_gpu_id) + suffix = '_cuda_' + str(device_option.device_id) else: raise RuntimeError( "Unknown device type: {}". 
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 7120843f33152d..2f6dedbfd80c83 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -83,17 +83,17 @@ def testDeviceScope(self): # explicitly setting a device device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 op = core.CreateOperator("Relu", "x", "y", device_option=device_option) self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) with core.DeviceScope(device_option): # from device scope op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) # from an overridden device option override_device = caffe2_pb2.DeviceOption() override_device.device_type = caffe2_pb2.CPU @@ -109,13 +109,13 @@ def testDeviceScope(self): def testNameAndDeviceScopeTogether(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): with core.NameScope("foo"): op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(len(op.input), 1) self.assertEqual(op.input[0], "foo/x") self.assertEqual(len(op.output), 1) @@ -255,7 +255,7 @@ class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 op = core.CreateOperator( "Ludicrous", "x", "y", name="ludicrous", control_input="z", device_option=device_option, @@ -271,7 +271,7 @@ def testCreate(self): self.assertEqual(op.control_input[0], "z") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertTrue(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args @@ -574,7 +574,7 @@ def test_check_equal_default_value(self): opt2 = caffe2_pb2.DeviceOption() opt1.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.cuda_gpu_id = 5 + opt1.device_id = 5 # opt1 still is on CPU, so the options should be equal self.assertTrue(core.device_option_equal(opt1, opt2)) opt2.device_type = 0 @@ -649,7 +649,7 @@ class TestInferDevice(test_util.TestCase): def setUp(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 self.cuda_option = device_option self.cpu_option = caffe2_pb2.DeviceOption() @@ -748,7 +748,7 @@ def test_inject_copy(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) @@ -765,7 +765,7 @@ def 
test_inject_copy(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") self.assertNotEqual(blob_to_device["fc_w"], device_option) @@ -775,7 +775,7 @@ def test_cross_nets(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) const = init_net.ConstantFill([], 'const', shape=[], value=1.) @@ -791,12 +791,12 @@ def test_cross_nets(self): op = nets[1]._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "fc_w_cuda_1") op = nets[1]._net.op[1] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "fc_b_cuda_1") op = nets[1]._net.op[2] self.assertEqual(op.type, "FC") @@ -804,7 +804,7 @@ def test_cross_nets(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) op = nets[1]._net.op[3] self.assertEqual(op.type, "Add") self.assertEqual(op.input[0], "fc1") @@ -822,7 +822,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -832,7 +832,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -844,7 +844,7 @@ def test_cross_nets(self): type: "FC" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -855,7 +855,7 @@ def test_cross_nets(self): type: "Add" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } external_input: "data" @@ -870,7 +870,7 @@ def test_cross_nets_no_change(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) @@ -887,7 +887,7 @@ def test_cross_nets_no_change(self): self.assertEqual(op.input[1], "fc_w") self.assertEqual(op.input[2], "fc_b") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) """ For reference, net.Proto() should be like: name: "" @@ -900,7 +900,7 @@ def test_cross_nets_no_change(self): type: "FC" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } external_input: "data" @@ -912,7 +912,7 @@ def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu1") @@ -920,10 
+920,10 @@ def test_inject_copy_multi_use(self): with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") - device_option.cuda_gpu_id = 0 + device_option.device_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") @@ -931,12 +931,12 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") @@ -945,7 +945,7 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[3] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] @@ -955,18 +955,18 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.device_option.device_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """ @@ -979,7 +979,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -989,7 +989,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -1005,7 +1005,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } op { @@ -1021,7 +1021,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - cuda_gpu_id: 0 + device_id: 0 } } op { @@ -1031,7 +1031,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 0 + device_id: 0 } } op { @@ -1041,7 +1041,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - cuda_gpu_id: 1 + device_id: 1 } } external_input: "data" @@ -1060,7 +1060,7 @@ def test_inject_copy_placeholder_ops(self): cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) gpu_device[i].device_type = caffe2_pb2.CUDA - gpu_device[i].cuda_gpu_id = 0 + gpu_device[i].device_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 
'node:1' @@ -1100,12 +1100,12 @@ def test_inject_copy_placeholder_ops(self): op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.device_option.device_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) @@ -1128,7 +1128,7 @@ def test_blob_inplace(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + device_option.device_id = 1 net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) with core.DeviceScope(device_option): diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 89770dc6ea7d9a..749c8b12c930e8 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -813,7 +813,7 @@ def builder_fun(model): device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" - namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) + namescope = "{}_{}/".format(device_prefix, device.device_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -1540,7 +1540,7 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.cuda_gpu_id + op_gpu = op_dev.device_id # This avoids failing on operators that are only for CPU if op_dev.device_type != caffe2_pb2.CUDA: @@ -1904,7 +1904,7 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.cuda_gpu_id].append(op) + ops[op.device_option.device_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 5cc18f99bd9eb9..8470df1588717f 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -259,7 +259,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # Include device option for each GPU expanded_device_options = [cpu_do] + ( - [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) + [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, device_id=i) for i in range(workspace.NumCudaDevices())] if workspace.has_gpu_support else []) diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index f8e3f32bb2c225..1e881d27f49dc8 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -596,7 +596,7 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not None: step_op.device_option.device_type = device.device_type - step_op.device_option.cuda_gpu_id = device.cuda_gpu_id + step_op.device_option.device_id = device.device_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -610,7 +610,7 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.cuda_gpu_id = device.cuda_gpu_id + op.device_option.device_id = device.device_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py 
b/caffe2/python/muji.py index b407f96d2391f8..2f2b5aced6640e 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -26,7 +26,7 @@ def OnGPU(gpu_id): """ device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + device_option.device_id = gpu_id return device_option diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 4b5cddb61d244e..7583f863b1f5ad 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -268,11 +268,11 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): if not dev_opt or not ( - dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): + dev_opt.device_type or dev_opt.device_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, dev_opt.device_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 8d3a362dcdf725..3178345cf46e21 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -27,7 +27,7 @@ def build_test_net(net_name): gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = caffe2_pb2.CUDA - gpu_device_option.cuda_gpu_id = 0 + gpu_device_option.device_id = 0 net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu", device_option=gpu_device_option) diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 8cc3f9e2fa98eb..5802e49de526dc 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -24,7 +24,7 @@ def __init__(self, init_net, predict_net, workspace, uninitialized): @property def _name_scope(self): if self.predict_net.device_option.device_type == caffe2_pb2.CUDA: - return 'gpu_{}'.format(self.predict_net.device_option.cuda_gpu_id) + return 'gpu_{}'.format(self.predict_net.device_option.device_id) return '' def run(self, inputs, **kwargs): diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index 07f378beb18ff0..b90a7f84b4ed8a 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -89,7 +89,7 @@ def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll): self.assertEqual(proto.tensor.device_detail.device_type, device_type) if device_type == caffe2_pb2.CUDA: - self.assertEqual(proto.tensor.device_detail.cuda_gpu_id, + self.assertEqual(proto.tensor.device_detail.device_id, gpu_id) blobs = [str(i) for i in range(len(arrays))] diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 9d9bb38e178517..66ac07dbdca079 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1216,7 +1216,7 @@ def test_lstm_extract_predictor_net(self): if arg.name == "step_net": for step_op in arg.n.op: self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.cuda_gpu_id) + self.assertEqual(1, step_op.device_option.device_id) elif arg.name == 'backward_step_net': self.assertEqual(caffe2_pb2.NetDef(), arg.n) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 0c5b18b0b6ab11..ddd5871f7d4b74 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -83,7 +83,7 @@ def make_unique_blob_name(self, base_str): if 
current_scope.device_type == caffe2_pb2.CUDA: return self.get_gpu_blob_name( - base_str, current_scope.cuda_gpu_id, current_scope.node_name + base_str, current_scope.device_id, current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -279,7 +279,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name), shape=[1], value=1.0 ) @@ -488,12 +488,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + [], "wd_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=self.weight_decay ) @@ -1160,7 +1160,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0 ) diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index b4c71535debe66..ef11246bdfcc9b 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -193,7 +193,7 @@ def test_load_device_scope(self): # check device options for op in list(init_net.Proto().op) + list(predict_init_net.Proto().op): - self.assertEqual(1, op.device_option.cuda_gpu_id) + self.assertEqual(1, op.device_option.device_id) self.assertEqual(caffe2_pb2.CPU, op.device_option.device_type) def test_db_fails_without_params(self): diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 679152c788132e..6db4ae42b84742 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -34,7 +34,7 @@ class DLPackWrapper { "Unsupported device type: ", device_option.device_type()); tensor_context.device_type = *device_type_ptr; - tensor_context.device_id = device_option.cuda_gpu_id(); + tensor_context.device_id = device_option.device_id(); if (tensor->size() <= 0) { tensor->Resize(0); @@ -87,7 +87,7 @@ class DLPackWrapper { int dlpack_device_id = dlTensor->ctx.device_id; CAFFE_ENFORCE_EQ( dlpack_device_id, - device_option.cuda_gpu_id(), + device_option.device_id(), "Expected same device id for DLPack and C2 tensors"); std::vector dims; diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index dc8e088eba97c5..dd80282238a80b 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -30,7 +30,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { case PROTO_CPU: return option.numa_node_id(); case PROTO_CUDA: - return option.cuda_gpu_id(); + return option.device_id(); case PROTO_MKLDNN: return option.numa_node_id(); case PROTO_HIP: @@ -43,7 +43,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { return ( lhs.device_type() == rhs.device_type() && - lhs.cuda_gpu_id() == rhs.cuda_gpu_id() && + lhs.device_id() == rhs.device_id() && lhs.hip_gpu_id() == rhs.hip_gpu_id() && lhs.node_name() == rhs.node_name() && lhs.numa_node_id() == rhs.numa_node_id()); diff --git 
a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc index c9f37f4c98c290..5d8fb86b34e3bb 100644 --- a/caffe2/utils/proto_utils_test.cc +++ b/caffe2/utils/proto_utils_test.cc @@ -11,9 +11,9 @@ TEST(ProtoUtilsTest, IsSameDevice) { EXPECT_FALSE(IsSameDevice(a, b)); b.set_node_name("my_node"); EXPECT_TRUE(IsSameDevice(a, b)); - b.set_cuda_gpu_id(2); + b.set_device_id(2); EXPECT_FALSE(IsSameDevice(a, b)); - a.set_cuda_gpu_id(2); + a.set_device_id(2); EXPECT_TRUE(IsSameDevice(a, b)); a.set_device_type(DeviceTypeProto::PROTO_CUDA); b.set_device_type(DeviceTypeProto::PROTO_CPU); diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index 113403fd87bbf4..3a98a4cb7d9f3e 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -2216,7 +2216,7 @@ "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2), "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2), "curand_generator" : ("hiprand_generator", API_CAFFE2), - "cuda_gpu_id" : ("hip_gpu_id", API_CAFFE2), + "device_id" : ("hip_gpu_id", API_CAFFE2), "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2), } From eba1cf2145e52c24538282afa7a2a1aaee053438 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Mon, 1 Oct 2018 12:42:15 -0700 Subject: [PATCH 69/82] Unify style (#11949) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11949 Unify naming style Reviewed By: yinghai Differential Revision: D9931227 fbshipit-source-id: b6956bd98ed8625623e4747d616989f9f3a2ed46 --- caffe2/core/net_async_base.cc | 10 +++++----- caffe2/core/net_async_base.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index a694a4865c6cb3..d2fad7cff10b1c 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -119,7 +119,7 @@ bool AsyncNetBase::RunAsync() { return DoRunAsync(); } -TaskThreadPool* AsyncNetBase::pool_getter( +TaskThreadPool* AsyncNetBase::poolGetter( PoolsMap& pools, int device_type, int device_id, @@ -136,7 +136,7 @@ TaskThreadPool* AsyncNetBase::pool_getter( TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { if (use_single_pool_) { - return pool_getter(cpu_pools_, PROTO_CPU, -1, num_workers_); + return poolGetter(cpu_pools_, PROTO_CPU, -1, num_workers_); } static const std::unordered_set cpu_types{ PROTO_CPU, @@ -155,13 +155,13 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { FLAGS_caffe2_net_async_max_numa_nodes, "Invalid NUMA node id: ", numa_node_id); - return pool_getter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); + return poolGetter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { auto gpu_id = device_option.device_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); - return pool_getter(gpu_pools_, PROTO_CUDA, gpu_id, num_workers_); + return poolGetter(gpu_pools_, PROTO_CUDA, gpu_id, num_workers_); } else { CAFFE_THROW( "Unsupported device type " + @@ -281,7 +281,7 @@ bool AsyncNetBase::testAndSetScheduled(int task_id) { return !task_op_node.scheduled_.test_and_set(); } -int AsyncNetBase::num_ops(int task_id) const { +int AsyncNetBase::numOps(int task_id) const { return chains_[task_id].size(); } diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 
29be2c04daccfc..6d76647535c1af 100644
--- a/caffe2/core/net_async_base.h
+++ b/caffe2/core/net_async_base.h
@@ -65,7 +65,7 @@ class CAFFE2_API AsyncNetBase : public NetBase {
   int updateParentCount(int child_id);
   int getParentCount(int child_id);
   bool testAndSetScheduled(int task_id);
-  int num_ops(int task_id) const;
+  int numOps(int task_id) const;

   void asyncWait(
       int task_id,
@@ -131,7 +131,7 @@ class CAFFE2_API AsyncNetBase : public NetBase {
   void storeExceptionPtr();

   TaskThreadPool*
-  pool_getter(PoolsMap& pools, int device_type, int device_id, int pool_size);
+  poolGetter(PoolsMap& pools, int device_type, int device_id, int pool_size);

   std::unique_ptr helper_;

From 06f535d8a0e0d21e983fc7715123e47c69a2f12f Mon Sep 17 00:00:00 2001
From: Ilia Cherniavskii
Date: Mon, 1 Oct 2018 12:54:39 -0700
Subject: [PATCH 70/82] More debug info in plan executor (#12183)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12183

Add more debug info printed from the plan executor.

Reviewed By: manojkris

Differential Revision: D10113104

fbshipit-source-id: dddc9aec8012c8575ab305033388412fdaaac537
---
 caffe2/core/plan_executor.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc
index 8e48b6b7beabca..51faaed9e7eec9 100644
--- a/caffe2/core/plan_executor.cc
+++ b/caffe2/core/plan_executor.cc
@@ -489,7 +489,9 @@ bool RunPlanOnWorkspace(
   NetDefMap net_defs;
   for (const NetDef& net_def : plan.network()) {
-    LOG(INFO) << "Processing net '" << net_def.name() << "'";
+    LOG(INFO) << "Processing net '" << net_def.name() << "', type: '"
+              << net_def.type() << "', #ops: " << net_def.op_size()
+              << ", num_workers: " << net_def.num_workers();
     CAFFE_ENFORCE(
         net_defs.count(net_def.name()) == 0,
         "Your plan contains networks of the same name \"",

From 1b59cf8b51e9af35fdbc6cb3ed6751122b8ad189 Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Mon, 1 Oct 2018 13:33:31 -0700
Subject: [PATCH 71/82] Add support for using LLVM 7 in CI

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12182

Differential Revision: D10129630

Pulled By: bddppq

fbshipit-source-id: f0217336474b807f03f84a4b8052ce92a6e3564b
---
 docker/caffe2/jenkins/common/install_clang.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docker/caffe2/jenkins/common/install_clang.sh b/docker/caffe2/jenkins/common/install_clang.sh
index 694606ec0b91f3..fbf5515bae36d5 100755
--- a/docker/caffe2/jenkins/common/install_clang.sh
+++ b/docker/caffe2/jenkins/common/install_clang.sh
@@ -4,6 +4,13 @@ set -ex

 [ -n "$CLANG_VERSION" ]

+if [[ "$CLANG_VERSION" == "7" ]]; then
+  apt-get update
+  apt-get install -y --no-install-recommends software-properties-common wget
+  wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+  apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-7 main"
+fi
+
 apt-get update
 apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
 rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

From 15d28e400f71c50848b760ca188ebc885ee433d8 Mon Sep 17 00:00:00 2001
From: Roy Li
Date: Mon, 1 Oct 2018 13:52:39 -0700
Subject: [PATCH 72/82] Remove support for C extensions (#12122)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12122

We are deprecating support for C extensions. Please use C++ extensions in the future.
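For reference, a minimal sketch of the replacement path with torch.utils.cpp_extension; the extension and file names below are illustrative, not part of this change:

    # setup.py -- hypothetical example of the C++ extension workflow
    from setuptools import setup
    from torch.utils.cpp_extension import CppExtension, BuildExtension

    setup(
        name='my_lib',  # illustrative package name
        # CppExtension compiles my_lib.cpp against the torch headers and libs
        ext_modules=[CppExtension('my_lib', ['my_lib.cpp'])],
        # BuildExtension supplies the right compiler/linker flags for torch
        cmdclass={'build_ext': BuildExtension},
    )

CUDA extensions follow the same pattern with torch.utils.cpp_extension.CUDAExtension.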
Reviewed By: Yangqing Differential Revision: D10060541 fbshipit-source-id: 4f7149e06a254bd7af463fd7aa9740f65369963a --- .jenkins/pytorch/build.sh | 11 -- test/test_utils.py | 81 +------------- torch/utils/ffi/__init__.py | 214 +----------------------------------- 3 files changed, 4 insertions(+), 302 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 2dc64157c5d00d..e076b329b28f5b 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -102,17 +102,6 @@ fi # Add the test binaries so that they won't be git clean'ed away git add -f build/bin -# Test C FFI plugins -# cffi install doesn't work for Python 3.7 -if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then - # TODO: Don't run this here - pip install cffi - git clone https://github.com/pytorch/extension-ffi.git - pushd extension-ffi/script - python build.py - popd -fi - # Test documentation build if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs diff --git a/test/test_utils.py b/test/test_utils.py index 971e8a4f05f8e0..dff6102e4579e7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -25,16 +25,6 @@ from common import TestCase, run_tests, download_file -try: - import cffi - HAS_CFFI = True -except ImportError: - HAS_CFFI = False - - -if HAS_CFFI: - from torch.utils.ffi import create_extension - class SimplePlugin(Plugin): @@ -371,74 +361,9 @@ def test_model_gradient(self): class TestFFI(TestCase): - - def setUp(self): - self.tmpdir = tempfile.mkdtemp() - os.chdir(self.tmpdir) - sys.path.append(self.tmpdir) - - def tearDown(self): - shutil.rmtree(self.tmpdir) - - @unittest.skipIf(not HAS_CFFI, "ffi tests require cffi package") - @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") - @unittest.skipIf(IS_PPC, "skip for ppc64le due to incompatible exception handling") - def test_cpu(self): - create_extension( - name='test_extensions.cpulib', - headers=[test_dir + '/ffi/src/cpu/lib.h'], - sources=[ - test_dir + '/ffi/src/cpu/lib1.c', - test_dir + '/ffi/src/cpu/lib2.c', - ], - verbose=False, - ).build() - from test_extensions import cpulib - tensor = torch.ones(2, 2).float() - - cpulib.good_func(tensor, 2, 1.5) - self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5) - - new_tensor = cpulib.new_tensor(4) - self.assertEqual(new_tensor, torch.ones(4, 4) * 4) - - f = cpulib.int_to_float(5) - self.assertIs(type(f), float) - - self.assertRaises(TypeError, - lambda: cpulib.good_func(tensor.double(), 2, 1.5)) - self.assertRaises(torch.FatalError, - lambda: cpulib.bad_func(tensor, 2, 1.5)) - - @unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package") - @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") - @skipIfRocm - def test_gpu(self): - from torch.utils.cpp_extension import CUDA_HOME - create_extension( - name='gpulib', - headers=[test_dir + '/ffi/src/cuda/cudalib.h'], - sources=[ - test_dir + '/ffi/src/cuda/cudalib.c', - ], - with_cuda=True, - verbose=False, - include_dirs=[os.path.join(CUDA_HOME, 'include')], - ).build() - import gpulib - tensor = torch.ones(2, 2).float() - - gpulib.good_func(tensor, 2, 1.5) - self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5) - - ctensor = tensor.cuda().fill_(1) - gpulib.cuda_func(ctensor, 2, 1.5) - self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5) - - self.assertRaises(TypeError, - lambda: gpulib.cuda_func(tensor, 2, 1.5)) - self.assertRaises(TypeError, - lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5)) + def test_deprecated(self): + with 
self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. Please use cpp extensions instead."): + from torch.utils.ffi import create_extension @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') diff --git a/torch/utils/ffi/__init__.py b/torch/utils/ffi/__init__.py index 086cd99839eb1f..e47a4f8a341705 100644 --- a/torch/utils/ffi/__init__.py +++ b/torch/utils/ffi/__init__.py @@ -1,213 +1 @@ -import os -import glob -import tempfile -import shutil -from functools import wraps, reduce -from string import Template -import torch -import torch.cuda -from torch._utils import _accumulate - -try: - import cffi -except ImportError: - raise ImportError("torch.utils.ffi requires the cffi package") - - -if cffi.__version_info__ < (1, 4, 0): - raise ImportError("torch.utils.ffi requires cffi version >= 1.4, but " - "got " + '.'.join(map(str, cffi.__version_info__))) - - -def _generate_typedefs(): - typedefs = [] - for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']: - for lib in ['TH', 'THCuda']: - for kind in ['Tensor', 'Storage']: - python_name = t + kind - if t == 'Float' and lib == 'THCuda': - th_name = 'THCuda' + kind - else: - th_name = lib + t + kind - th_struct = 'struct ' + th_name - - typedefs += ['typedef {} {};'.format(th_struct, th_name)] - # We have to assemble a string here, because we're going to - # do this lookup based on tensor.type(), which returns a - # string (not a type object, as this code was before) - python_module = 'torch.cuda' if lib == 'THCuda' else 'torch' - python_class = python_module + '.' + python_name - _cffi_to_torch[th_struct] = python_class - _torch_to_cffi[python_class] = th_struct - return '\n'.join(typedefs) + '\n' -_cffi_to_torch = {} -_torch_to_cffi = {} -_typedefs = _generate_typedefs() - - -PY_MODULE_TEMPLATE = Template(""" -from torch.utils.ffi import _wrap_function -from .$cffi_wrapper_name import lib as _lib, ffi as _ffi - -__all__ = [] -def _import_symbols(locals): - for symbol in dir(_lib): - fn = getattr(_lib, symbol) - if callable(fn): - locals[symbol] = _wrap_function(fn, _ffi) - else: - locals[symbol] = fn - __all__.append(symbol) - -_import_symbols(locals()) -""") - - -def _setup_wrapper(with_cuda): - here = os.path.abspath(os.path.dirname(__file__)) - lib_dir = os.path.join(here, '..', '..', 'lib') - include_dirs = [ - os.path.join(lib_dir, 'include'), - os.path.join(lib_dir, 'include', 'TH'), - ] - - wrapper_source = '#include \n' - if with_cuda: - import torch.cuda - wrapper_source += '#include \n' - if os.sys.platform == 'win32': - cuda_include_dirs = glob.glob(os.getenv('CUDA_PATH', '') + '/include') - cuda_include_dirs += glob.glob(os.getenv('NVTOOLSEXT_PATH', '') + '/include') - else: - cuda_include_dirs = glob.glob('/usr/local/cuda/include') - cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include') - include_dirs.append(os.path.join(lib_dir, 'include', 'THC')) - include_dirs.extend(cuda_include_dirs) - return wrapper_source, include_dirs - - -def _create_module_dir(base_path, fullname): - module, _, name = fullname.rpartition('.') - if not module: - target_dir = name - else: - target_dir = reduce(os.path.join, fullname.split('.')) - target_dir = os.path.join(base_path, target_dir) - try: - os.makedirs(target_dir) - except os.error: - pass - for dirname in _accumulate(fullname.split('.'), os.path.join): - init_file = os.path.join(base_path, dirname, '__init__.py') - open(init_file, 'a').close() # Create file if it doesn't exist yet - return name, target_dir - - 
-def _build_extension(ffi, cffi_wrapper_name, target_dir, verbose): - try: - tmpdir = tempfile.mkdtemp() - ext_suf = '.pyd' if os.sys.platform == 'win32' else '.so' - libname = cffi_wrapper_name + ext_suf - outfile = ffi.compile(tmpdir=tmpdir, verbose=verbose, target=libname) - shutil.copy(outfile, os.path.join(target_dir, libname)) - finally: - shutil.rmtree(tmpdir) - - -def _make_python_wrapper(name, cffi_wrapper_name, target_dir): - py_source = PY_MODULE_TEMPLATE.substitute(name=name, - cffi_wrapper_name=cffi_wrapper_name) - with open(os.path.join(target_dir, '__init__.py'), 'w') as f: - f.write(py_source) - - -def create_extension(name, headers, sources, verbose=True, with_cuda=False, - package=False, relative_to='.', **kwargs): - """Creates and configures a cffi.FFI object, that builds PyTorch extension. - - Arguments: - name (str): package name. Can be a nested module e.g. ``.ext.my_lib``. - headers (str or List[str]): list of headers, that contain only exported - functions - sources (List[str]): list of sources to compile. - verbose (bool, optional): if set to ``False``, no output will be printed - (default: True). - with_cuda (bool, optional): set to ``True`` to compile with CUDA headers - (default: False) - package (bool, optional): set to ``True`` to build in package mode (for modules - meant to be installed as pip packages) (default: False). - relative_to (str, optional): path of the build file. Required when - ``package is True``. It's best to use ``__file__`` for this argument. - kwargs: additional arguments that are passed to ffi to declare the - extension. See `Extension API reference`_ for details. - - .. _`Extension API reference`: https://docs.python.org/3/distutils/apiref.html#distutils.core.Extension - """ - base_path = os.path.abspath(os.path.dirname(relative_to)) - name_suffix, target_dir = _create_module_dir(base_path, name) - if not package: - cffi_wrapper_name = '_' + name_suffix - else: - cffi_wrapper_name = (name.rpartition('.')[0] + - '.{0}._{0}'.format(name_suffix)) - - wrapper_source, include_dirs = _setup_wrapper(with_cuda) - include_dirs.extend(kwargs.pop('include_dirs', [])) - - if os.sys.platform == 'win32': - library_dirs = glob.glob(os.getenv('CUDA_PATH', '') + '/lib/x64') - library_dirs += glob.glob(os.getenv('NVTOOLSEXT_PATH', '') + '/lib/x64') - - here = os.path.abspath(os.path.dirname(__file__)) - lib_dir = os.path.join(here, '..', '..', 'lib') - - library_dirs.append(os.path.join(lib_dir)) - else: - library_dirs = [] - library_dirs.extend(kwargs.pop('library_dirs', [])) - - if isinstance(headers, str): - headers = [headers] - all_headers_source = '' - for header in headers: - with open(os.path.join(base_path, header), 'r') as f: - all_headers_source += f.read() + '\n\n' - - ffi = cffi.FFI() - sources = [os.path.join(base_path, src) for src in sources] - # NB: TH headers are C99 now - kwargs['extra_compile_args'] = ['-std=c99'] + kwargs.get('extra_compile_args', []) - ffi.set_source(cffi_wrapper_name, wrapper_source + all_headers_source, - sources=sources, - include_dirs=include_dirs, - library_dirs=library_dirs, **kwargs) - ffi.cdef(_typedefs + all_headers_source) - - _make_python_wrapper(name_suffix, '_' + name_suffix, target_dir) - - def build(): - _build_extension(ffi, cffi_wrapper_name, target_dir, verbose) - ffi.build = build - return ffi - - -def _wrap_function(function, ffi): - @wraps(function) - def safe_call(*args, **kwargs): - args = tuple(ffi.cast(_torch_to_cffi.get(arg.type(), 'void') + '*', arg._cdata) - if isinstance(arg, torch.Tensor) 
or torch.is_storage(arg) - else arg - for arg in args) - args = (function,) + args - result = torch._C._safe_call(*args, **kwargs) - if isinstance(result, ffi.CData): - typeof = ffi.typeof(result) - if typeof.kind == 'pointer': - cdata = int(ffi.cast('uintptr_t', result)) - cname = typeof.item.cname - if cname in _cffi_to_torch: - # TODO: Maybe there is a less janky way to eval - # off of this - return eval(_cffi_to_torch[cname])(cdata=cdata) - return result - return safe_call +raise ImportError("torch.utils.ffi is deprecated. Please use cpp extensions instead.") From 8fa7de35f2700b8b23fcd8f98d2f5cdbabae9c95 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Mon, 1 Oct 2018 15:00:23 -0700 Subject: [PATCH 73/82] Enable ROCM clang-7 build Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12223 Differential Revision: D10133697 Pulled By: bddppq fbshipit-source-id: c1de99afccdad415ac1beb85d3b8ab44f9b58738 --- .jenkins/pytorch/enabled-configs.txt | 4 ++-- caffe2/CMakeLists.txt | 9 --------- test/custom_operator/CMakeLists.txt | 1 - 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt index da9f62db38ded2..cffb72aa7acc4f 100644 --- a/.jenkins/pytorch/enabled-configs.txt +++ b/.jenkins/pytorch/enabled-configs.txt @@ -40,8 +40,8 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build pytorch-docker-build-test short-perf-test-cpu short-perf-test-gpu -py2-clang3.8-rocm1.7.1-ubuntu16.04-build -py2-clang3.8-rocm1.7.1-ubuntu16.04-test +py2-clang7-rocmdeb-ubuntu16.04-build +py2-clang7-rocmdeb-ubuntu16.04-test pytorch-ppc64le-cuda9.2-cudnn7-py3-build pytorch-ppc64le-cuda9.2-cudnn7-py3-test pytorch-ppc64le-cuda9.1-cudnn7-py3-build diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 885ca028fb2464..07f69d9f7bab98 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -400,9 +400,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) @@ -416,9 +413,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) @@ -434,9 +428,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index f692bdfae123b9..059b004a84840e 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -5,7 +5,6 @@ 
 project(custom_ops)
 find_package(Torch REQUIRED)
 add_library(custom_ops SHARED op.cpp)
-target_compile_features(custom_ops PUBLIC cxx_range_for)
 target_link_libraries(custom_ops ${TORCH_LIBRARIES})
 add_executable(test_custom_ops test_custom_ops.cpp)

From 35becd1879fae56fc417905c5154c402b9780a3f Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 1 Oct 2018 15:42:45 -0700
Subject: [PATCH 74/82] New version of PT1 model format (#12149)

Summary:
We considered four different existing formats: 1) static graph, 2) Torch Script,
3) pickle files, and 4) the PyTorch C++ serialization APIs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/12149

Reviewed By: BIT-silence

Differential Revision: D10098106

Pulled By: houseroad

fbshipit-source-id: 94ec7fc57c842e50fae5286ddeda657a4967a07a
---
 caffe2/core/blob_serialization.cc |  15 +-
 caffe2/proto/caffe2.proto         |  86 ++++-
 caffe2/proto/torch.proto          | 564 +++---------------------------
 caffe2/python/convert.py          |  56 ---
 caffe2/python/convert_test.py     | 234 -------------
 caffe2/python/pybind_state.cc     |  43 +--
 caffe2/python/workspace.py        |   4 +-
 caffe2/utils/proto_convert.cc     | 181 ----------
 caffe2/utils/proto_convert.h      |  14 -
 9 files changed, 139 insertions(+), 1058 deletions(-)

diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index 8126b3d59425a1..f27d16adf342f8 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -308,6 +308,12 @@ void TensorSerializer::Serialize(
             const_cast(raw_data + i * input.itemsize()), input.meta());
         proto.add_string_data(SerializeBlob(temp_blob, ""));
       }
+    } break;
+    case TensorProto_DataType_SPECIAL: {
+      CAFFE_THROW("SPECIAL Tensor is not handled yet.");
+    } break;
+    case TensorProto_DataType_NO_CONTENT: {
+      CAFFE_THROW("NO_CONTENT Tensor should not be serialized.");
     } break;
     // Note: we intentially do not provide "default:" so if any new data types
     // are added, the compiler should warn the user to add the case here.
@@ -520,7 +526,14 @@ void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) {
             (i + chunkBegin) * temp_blob.meta().itemsize(),
             1);
       }
-    }
+    } break;
+    case TensorProto_DataType_SPECIAL: {
+      CAFFE_THROW("SPECIAL Tensor is not handled yet.");
+    } break;
+    case TensorProto_DataType_NO_CONTENT: {
+      CAFFE_THROW("NO_CONTENT Tensor should not be deserialized.");
+    } break;
+    // Note: we intentionally do not provide "default:" so if any new data types
   }
   context->FinishDeviceComputation();
 }

diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto
index 21bdec2c6883b1..63a2a256ded1ea 100644
--- a/caffe2/proto/caffe2.proto
+++ b/caffe2/proto/caffe2.proto
@@ -15,23 +15,46 @@ package caffe2;
 message TensorProto {
   // The dimensions in the tensor.
   repeated int64 dims = 1;
+  // The strides of the tensor.
+  repeated int64 strides = 12;
+
+  // Data type
   enum DataType {
     UNDEFINED = 0;
-    FLOAT = 1; // float
-    INT32 = 2; // int
-    BYTE = 3; // BYTE, when deserialized, is going to be restored as uint8.
-    STRING = 4; // string
-    // Less-commonly used data types.
-    BOOL = 5; // bool
-    UINT8 = 6; // uint8_t
-    INT8 = 7; // int8_t
-    UINT16 = 8; // uint16_t
-    INT16 = 9; // int16_t
-    INT64 = 10; // int64_t
+
+    // Basic types
+    FLOAT = 1; // float
+    INT32 = 2; // int
+    BYTE = 3; // byte, when deserialized, is going to be restored as uint8
+    STRING = 4; // string
+
+    // Less-commonly used data types
+    BOOL = 5; // bool
+    UINT8 = 6; // uint8_t
+    INT8 = 7; // int8_t
+    UINT16 = 8; // uint16_t
+    INT16 = 9; // int16_t
+    INT64 = 10; // int64_t
     FLOAT16 = 12; // at::Half
-    DOUBLE = 13; // double
+    DOUBLE = 13; // double
+
+    // Special data type, type information is stored in the special type field
+    SPECIAL = 51;
+    // Use TensorProto to specify the shape and type
+    NO_CONTENT = 52;
   }
   optional DataType data_type = 2 [default = FLOAT];
+  // if data_type is SPECIAL, use this field to express the type info
+  optional SpecialType special_type = 13;
+
+  // Data storage
+  enum StorageType {
+    TYPED = 1;
+    RAW = 2;
+    EXTERNAL = 3;
+    ALIAS = 4;
+  }
+  optional StorageType storage_type = 14 [default = TYPED];

   // For float
   repeated float float_data = 3 [packed = true];
   // For int32, uint8, int8, uint16, int16, bool, and float16
@@ -46,6 +69,13 @@ message TensorProto {
   repeated double double_data = 9 [packed = true];
   // For int64
   repeated int64 int64_data = 10 [packed = true];
+  // For raw data
+  optional bytes raw_data = 15;
+  // External data by file name
+  optional string external_data = 16;
+  // For argument, to share the content
+  optional string alias = 17;
+
   // Optionally, a name for the tensor.
   optional string name = 7;

@@ -53,13 +83,23 @@ message TensorProto {
   // it was serialized from. This is useful in cases like snapshotting a whole
   // workspace in a multi-GPU environment.
   optional DeviceOption device_detail = 8;
+
   // When loading from chunks this is going to indicate where to put data in the
   // full array. When not used full data have to be present
   message Segment {
     required int64 begin = 1;
     required int64 end = 2;
+    optional int64 chunk_num = 51;
+    optional int64 chunk_id = 52;
   }
   optional Segment segment = 11;
+  optional string debug_info = 18;
+
+  // For PyTorch serialized tensor.
+  optional bool require_gradient = 19;
+  optional bool is_buffer = 20;
+
+  repeated Argument annotations = 21;
 }

 message QTensorProto {
@@ -86,7 +126,11 @@ message TensorShape {
   repeated int32 unknown_dims = 3;
   optional bool unknown_shape = 4 [default = false];
   optional string name = 5;
+}

+// This is prepared for non-tensor types.
+message SpecialType {
+  optional string name = 1;
 }

 message TensorShapes {
@@ -97,13 +141,17 @@
 // values, or repeated float, int and string arrays.
 message Argument {
   optional string name = 1;
+
   optional float f = 2;
   optional int64 i = 3;
   optional bytes s = 4;
+  optional TensorProto t = 10;
   optional NetDef n = 8;
+
   repeated float floats = 5;
   repeated int64 ints = 6;
   repeated bytes strings = 7;
+  repeated TensorProto tensors = 11;
   repeated NetDef nets = 9;
 }

@@ -152,7 +200,11 @@ message DeviceOption {
 // Operator Definition.
 message OperatorDef {
   repeated string input = 1; // the name of the input blobs
+  // the input name in the schema, for named inputs
+  repeated string mapped_inputs = 11;
   repeated string output = 2; // the name of output top blobs
+  // the output name in the schema, for named outputs
+  repeated string mapped_outputs = 12;
   optional string name = 3; // the operator name. This is optional.
   // the operator type. This is needed to create the object from the operator
   // registry.
@@ -186,6 +238,16 @@ message OperatorDef { // This is an optional string with no assumed characteristics as // operators can be constructed in any language. optional string debug_info = 10; + + // additional annotations + repeated Argument annotations = 13; + + // for jit ir exporting + optional string aten_function = 14; + + // for operator versioning + optional string domain = 15; + optional string op_version = 16; } // Network definition. diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto index 43dfd02b14c8cc..f31c3b65ec1894 100644 --- a/caffe2/proto/torch.proto +++ b/caffe2/proto/torch.proto @@ -4,547 +4,77 @@ import "caffe2/proto/caffe2.proto"; package torch; -// Overview -// -// ONNX is an open specification that is comprised of the following components: -// -// 1) A definition of an extensible computation graph model. -// 2) Definitions of standard data types. -// 3) Definitions of built-in operators. -// -// This document describes the syntax of models and their computation graphs, -// as well as the standard data types. Together, they are referred to as the ONNX -// Intermediate Representation, or 'IR' for short. -// -// The normative semantic specification of the ONNX IR is found in docs/IR.md. -// Definitions of the built-in neural network operators may be found in docs/Operators.md. - -// Notes -// -// Release -// -// We are still in the very early stage of defining ONNX. The current -// version of ONNX is a starting point. While we are actively working -// towards a complete spec, we would like to get the community involved -// by sharing our working version of ONNX. -// -// Protobuf compatibility -// -// To simplify framework compatibility, ONNX is defined using the subset of -// protobuf that is compatible with both protobuf v2 and v3. This means that we -// do not use any protobuf features that are only available in one of the two -// versions. -// -// Here are the most notable contortions we have to carry out to work around -// these limitations: -// -// - No 'map' (added protobuf 3.0). We instead represent mappings as lists -// of key-value pairs, where order does not matter and duplicates -// are not allowed. - -// Versioning -// -// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md -// -// To be compatible with both proto2 and proto3, we will use a version number -// that is not defined by the default value but an explicit enum number. -enum Version { - // proto3 requires the first enum value to be zero. - // We add this just to appease the compiler. +enum ProtoVersion { _START_VERSION = 0; - // The version field is always serialized and we will use it to store the - // version that the graph is generated from. This helps us set up version - // control. - // For the IR, we are using simple numbers starting with with 0x00000001, - // which was the version we published on Oct 10, 2017. 
-  IR_VERSION_2017_10_10 = 0x0000000000000001;
-
-  // IR_VERSION 2 published on Oct 30, 2017
-  // - Added type discriminator to AttributeProto to support proto3 users
-  IR_VERSION_2017_10_30 = 0x0000000000000002;
-
-  // IR VERSION 3 published on Nov 3, 2017
-  // - For operator versioning:
-  //    - Added new message OperatorSetIdProto
-  //    - Added opset_import in ModelProto
-  // - For vendor extensions, added domain in NodeProto
-  IR_VERSION_NEWEST_ONNX = 0x0000000000000003;
-
-  // PYTORCH IR VERSION
-  IR_VERSION_NEWEST = 0x0000000000000103;
+  IR_VERSION_NEWEST = 0x0000000000000101;
 }

-// Attributes
-//
-// A named attribute containing either singular float, integer, string, graph,
-// and tensor values, or repeated float, integer, string, graph, and tensor values.
-// An AttributeProto MUST contain the name field, and *only one* of the
-// following content fields, effectively enforcing a C/C++ union equivalent.
-message AttributeProto {
-
-  // Note: this enum is structurally identical to the OpSchema::AttrType
-  // enum defined in schema.h. If you rev one, you likely need to rev the other.
-  enum AttributeType {
-    UNDEFINED = 0;
-    FLOAT = 1;
-    INT = 2;
-    STRING = 3;
-    TENSOR = 4;
-    GRAPH = 5;
-
-    FLOATS = 6;
-    INTS = 7;
-    STRINGS = 8;
-    TENSORS = 9;
-    GRAPHS = 10;
-  }
-
-  // The name field MUST be present for this version of the IR.
-  optional string name = 1; // namespace Attribute
+message MethodDef {
+  // method name
+  optional string name = 1; // method name

-  // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function.
-  // In this case, this AttributeProto does not contain data, and it's a reference of attribute
-  // in parent scope.
-  // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph.
-  optional string ref_attr_name = 21;
+  // static graph
+  optional caffe2.NetDef graph = 2;
+  // method is represented as torch script
+  optional string torch_script = 3;

-  // A human-readable documentation for this attribute. Markdown is allowed.
-  optional string doc_string = 13;
+  // the names of inputs and outputs
+  repeated string inputs = 4;
+  repeated string outputs = 5;

-  // The type field MUST be present for this version of the IR.
-  // For 0.0.1 versions of the IR, this field was not defined, and
-  // implementations needed to use has_field hueristics to determine
-  // which value field was in use. For IR_VERSION 0.0.2 or later, this
-  // field MUST be set and match the f|i|s|t|... field in use. This
-  // change was made to accomodate proto3 implementations.
-  optional AttributeType type = 20; // discriminator that indicates which field below is in use
+  // whether this method is main or not.
+  // by default, `forward` should be the main method.
+  optional bool is_main = 6;

-  // Exactly ONE of the following fields must be present for this version of the IR
-  optional float f = 2; // float
-  optional int64 i = 3; // int
-  optional bytes s = 4; // UTF-8 string
-  optional TensorProto t = 5; // tensor value
-  optional GraphProto g = 6; // graph
-  // Do not use field below, it's deprecated.
-  // optional ValueProto v = 12; // value - subsumes everything but graph
+  optional string debug_info = 7;

-  repeated float floats = 7; // list of floats
-  repeated int64 ints = 8; // list of ints
-  repeated bytes strings = 9; // list of UTF-8 strings
-  repeated TensorProto tensors = 10; // list of tensors
-  repeated GraphProto graphs = 11; // list of graph
+  repeated caffe2.Argument annotations = 8;
 }

-// Defines information on value, including the name, the type, and
-// the shape of the value.
-message ValueInfoProto {
-  // This field MUST be present in this version of the IR.
-  optional string name = 1; // namespace Value
-  // This field MUST be present in this version of the IR.
-  optional TypeProto type = 2;
-  // A human-readable documentation for this value. Markdown is allowed.
-  optional string doc_string = 3;
-}
-
-// Nodes
-//
-// Computation graphs are made up of a DAG of nodes, which represent what is
-// commonly called a "layer" or "pipeline stage" in machine learning frameworks.
-//
-// For example, it can be a node of type "Conv" that takes in an image, a filter
-// tensor and a bias tensor, and produces the convolved output.
-message NodeProto {
-  repeated string input = 1; // namespace Value
-  repeated string output = 2; // namespace Value
-  // An optional identifier for this node in a graph.
-  // This field MAY be absent in ths version of the IR.
-  optional string name = 3; // namespace Node
+message ModuleDef {
+  repeated ModuleDef submodules = 1;

-  // The symbolic identifier of the Operator to execute.
-  optional string op_type = 4; // namespace Operator
-  // The domain of the OperatorSet that specifies the operator named by op_type.
-  optional string domain = 7; // namespace Domain
+  // We expect to store the modules in one of the following formats:
+  //   - methods (static graph or torch script)
+  //   - pickle
+  //   - cpp_arena
+  repeated MethodDef methods = 2;
+  // because the old pickle modules may not be supported by torch_script,
+  // they have to be stored as pickle_arena at this moment.
+  optional bytes pickle_arena = 3;
+  // should be exposed by the Class Archive, so users can save
+  // module-specific data which cannot be stored in the graph or torch_script
+  optional bytes cpp_arena = 4;

-  // Additional named attributes.
-  repeated AttributeProto attribute = 5;
+  // the names of inputs and outputs of the module are inferred
+  // from the main method.

-  // A human-readable documentation for this node. Markdown is allowed.
-  // Equivalent to string debug_info
-  optional string doc_string = 6;
+  optional string debug_info = 5;

-  // Additional annotations, attributes are defined in Schema
-  // To be added as annotations:
-  // string engine
-  // string list control_input
-  // int64 is_gradient_op
-  repeated AttributeProto annotations = 8;
+  repeated caffe2.Argument annotations = 6;
 }

-  // Besides the node type, PyTorhc also serialize ATen function signature
-  optional caffe2.DeviceOption device_option = 51;
-  optional string aten_function = 52;
-}
-
-// Models
-//
-// ModelProto is a top-level file/container format for bundling a ML model and
-// associating its computation graph with metadata.
-//
-// The semantics of the model are described by the associated GraphProto.
-//
-// Model ==> Caffe2 MetaNetDef
-//       ==> PyTorch Module
-message ModelProto {
-  // The version of the IR this model targets. See Version enum above.
-  // This field MUST be present.
+message ModelDef {
   optional int64 ir_version = 1;

-  // The OperatorSets this model relies on.
- // All ModelProtos MUST have at least one entry that - // specifies which version of the ONNX OperatorSet is - // being imported. - // - // All nodes in the ModelProto's graph will bind against the operator - // with the same-domain/same-op_type operator with the HIGHEST version - // in the referenced operator sets. - repeated OperatorSetIdProto opset_import = 8; - - // The name of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - optional string producer_name = 2; - - // The version of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - optional string producer_version = 3; - - // Domain name of the model. - // We use reverse domain names as name space indicators. For example: - // `com.facebook.fair` or `com.microsoft.cognitiveservices` - // - // Together with `model_version` and GraphProto.name, this forms the unique identity of - // the graph. - optional string domain = 4; - - // The version of the graph encoded. See Version enum below. - optional int64 model_version = 5; - - // A human-readable documentation for this model. Markdown is allowed. - optional string doc_string = 6; + // main module of the model + optional ModuleDef main_module = 2; - // The parameterized graph that is evaluated to execute the model. - // The main graph, in single graph case, it is ONNX compatible. - optional GraphProto graph = 7; + repeated caffe2.TensorProto parameters = 3; + repeated caffe2.TensorProto value_infos = 4; - // The remaining nets in MetaNetDef. - // Submodules and methods in PyTorch. - repeated GraphProto methods = 15; - - // Named metadata values; keys should be distinct. - // Many meta data in MetaNetDef and preditor are piggy backed here. - // 1) project - // 2) model_class - // 3) internal_version - // 4) predictor_type - // 5) predictor_id - // 6) execute_plan - // 7) applicationSpecificInfo (another string map, need to verify it has no duplicate.) - // 8) engine - // 9) publish time - repeated StringStringEntryProto metadata_props = 14; - - // Model name - optional string name = 16; - - // Model name - repeated AttributeProto annotations = 17; - - // Mapping from list name to blob name list, must be string list type. - // Equivalent to blobs in MetaNetDef. - repeated AttributeProto blob_lists = 51; - - // Mapping from plan name to serialized plan, must be string list type. - // Equivalent to plans in MetaNetDef. - repeated AttributeProto plans = 52; -}; - -// StringStringEntryProto follows the pattern for cross-proto-version maps. -// See https://developers.google.com/protocol-buffers/docs/proto3#maps -message StringStringEntryProto { - optional string key = 1; - optional string value= 2; -}; - -// Graphs -// -// A graph defines the computational logic of a model and is comprised of a parameterized -// list of nodes that form a directed acyclic graph based on their inputs and outputs. -// This is the equivalent of the "network" or "graph" in many deep learning -// frameworks. -// Graph ==> NetDef in Caffe2 -// ==> Submodule/Method in PyTorch -message GraphProto { - // The nodes in the graph, sorted topologically. - repeated NodeProto node = 1; - - // The name of the graph. - optional string name = 2; // namespace Graph - - // A list of named tensor values, used to specify constant inputs of the graph. 
- // Each TensorProto entry must have a distinct name (within the list) that - // also appears in the input list. - repeated TensorProto initializer = 5; - - // A human-readable documentation for this graph. Markdown is allowed. - optional string doc_string = 10; - - // The inputs and outputs of the graph. - repeated ValueInfoProto input = 11; - repeated ValueInfoProto output = 12; - - // Information for the values in the graph. The ValueInfoProto.name's - // must be distinct. It is optional for a value to appear in value_info list. - repeated ValueInfoProto value_info = 13; - - // Additional annotations. - repeated AttributeProto annotations = 14; - - // DO NOT USE the following fields, they were deprecated from earlier versions. - // repeated string input = 3; - // repeated string output = 4; - // optional int64 ir_version = 6; - // optional int64 producer_version = 7; - // optional string producer_tag = 8; - // optional string domain = 9; -} + // to distinguish whether exported from c2 or torch + optional string producer_name = 5; -// Tensors -// -// A serialized tensor value. -message TensorProto { - enum DataType { - UNDEFINED = 0; - // Basic types. - FLOAT = 1; // float - UINT8 = 2; // uint8_t - INT8 = 3; // int8_t - UINT16 = 4; // uint16_t - INT16 = 5; // int16_t - INT32 = 6; // int32_t - INT64 = 7; // int64_t - STRING = 8; // string - BOOL = 9; // bool + // put build version here + optional string producer_version = 6; - // Advanced types - FLOAT16 = 10; - DOUBLE = 11; - UINT32 = 12; - UINT64 = 13; - COMPLEX64 = 14; // complex with float32 real and imaginary components - COMPLEX128 = 15; // complex with float64 real and imaginary components - // Future extensions go here. + optional string name = 7; - // Special data type, real type information is stored in ValueInfoProto. - // If data_type is SPECIAL, raw_data should be used. - SPECIAL = 51; - } - - // The shape of the tensor. - repeated int64 dims = 1; - repeated int64 strides = 14; - - // The data type of the tensor. - optional DataType data_type = 2; - - // For very large tensors, we may want to store them in chunks, in which - // case the following fields will specify the segment that is stored in - // the current TensorProto. - message Segment { - optional int64 begin = 1; - optional int64 end = 2; - optional int64 chuck_num = 51; - optional int64 chuck_id = 52; - } - // Used as offset in the external shared data. - optional Segment segment = 3; - - // Tensor content must be organized in row-major order. - // - // Depending on the data_type field, exactly one of the fields below with - // name ending in _data is used to store the elements of the tensor. - - // For float and complex64 values - // Complex64 tensors are encoded as a single array of floats, - // with the real components appearing in odd numbered positions, - // and the corresponding imaginary component apparing in the - // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - // is encoded as [1.0, 2.0 ,3.0 ,4.0] - // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. - repeated float float_data = 4 [packed = true]; - - // For int32, uint8, int8, uint16, int16, bool, and Half values - // float16 values must be bit-wise converted to an uint16_t prior - // to writing to the buffer. - // When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, INT8, BOOL, or FLOAT16 - repeated int32 int32_data = 5 [packed = true]; - - // For strings. 
- // Each element of string_data is a UTF-8 encoded Unicode - // string. No trailing null, no leading BOM. The protobuf "string" - // scalar type is not used to match ML community conventions. - // When this field is present, the data_type field MUST be STRING - repeated bytes string_data = 6; - - // For int64. - // When this field is present, the data_type field MUST be INT64 - repeated int64 int64_data = 7 [packed = true]; - - // Optionally, a name for the tensor. - optional string name = 8; // namespace Value - - // A human-readable documentation for this tensor. Markdown is allowed. - optional string doc_string = 12; - - // Serializations can either use one of the fields above, or use this - // raw bytes field. The only exception is the string case, where one is - // required to store the content in the repeated bytes string_data field. - // - // When this raw_data field is used to store tensor value, elements MUST - // be stored in as fixed-width, little-endian order. - // Floating-point data types MUST be stored in IEEE 754 format. - // Complex64 elements must be written as two consecutive FLOAT values, real component first. - // Complex128 elements must be written as two consecutive DOUBLE values, real component first. - // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). - // - // Note: the advantage of specific field rather than the raw_data field is - // that in some cases (e.g. int data), protobuf does a better packing via - // variable length storage, and may lead to smaller binary footprint. - // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED - optional bytes raw_data = 9; - - // For double - // Complex64 tensors are encoded as a single array of doubles, - // with the real components appearing in odd numbered positions, - // and the corresponding imaginary component apparing in the - // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - // is encoded as [1.0, 2.0 ,3.0 ,4.0] - // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 - repeated double double_data = 10 [packed = true]; - - // For uint64 and uint32 values - // When this field is present, the data_type field MUST be - // UINT32 or UINT64 - repeated uint64 uint64_data = 11 [packed = true]; - - // External data by file name - optional string external_data = 13; - - // If two tensors represent the same weights/content, use alias. - // Must exist a TensorProto named alias in the initializer list. - // To avoid the duplicate tensor in attribute, such as value in Constant node. - // This is useful, if everything is stored just in the proto. - optional string alias = 16; - - // Additional annotations. - repeated AttributeProto annotations = 17; - - // Device info - optional caffe2.DeviceOption device_option = 51; - - // For PyTorch serialized tensor. - optional int64 require_gradient = 52; - optional int64 is_buffer = 53; -} - -// Defines a tensor shape. A dimension can be either an integer value -// or a symbolic variable. A symbolic variable represents an unknown -// dimension. -message TensorShapeProto { - message Dimension { - oneof value { - int64 dim_value = 1; - string dim_param = 2; // namespace Shape - }; - // Standard denotation can optionally be used to denote tensor - // dimensions with standard semantic descriptions to ensure - // that operations are applied to the correct axis of a tensor. 
- // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition - // for pre-defined dimension denotations. - optional string denotation = 3; - }; - // To represent a scalar, using no dim to represent 0-d tensor. - repeated Dimension dim = 1; - - repeated Dimension stride = 51; -} - -// Types -// -// The standard ONNX data types. -message TypeProto { - - message Tensor { - // This field MUST NOT have the value of UNDEFINED - // This field MUST be present for this version of the IR. - optional TensorProto.DataType elem_type = 1; - optional TensorShapeProto shape = 2; - } - - // Sequence type: List, Tuple - message Sequence { - // elem_type and elem_type_list cannot appear together. - // If all the element types are the same, we use elem_type, - // otherwise, we specify the type of each element in elem_type_list. - optional TypeProto elem_type = 1; - repeated TypeProto elem_type_list = 51; - enum SequenceType { - UNDEFINED = 0; - LIST = 1; - TUPLE = 2; - } - optional SequenceType sequence_type = 52; - } - - // Map, (not necessary at this moment) - message Map { - optional TensorProto.DataType key_type = 1; - optional TypeProto value_type = 2; - } - - // Special type of blobs, based on the type_name, we can choose the right - // serializer and deserialzier. - message SpecialBlob { - optional string type_name = 1; - } - - oneof value { - // The type of a tensor. - Tensor tensor_type = 1; - Sequence sequence_type = 4; - Map map_type = 5; - SpecialBlob special_type = 51; - } - - // An optional denotation can be used to denote the whole - // type with a standard semantic description as to what is - // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition - // for pre-defined type denotations. - optional string denotation = 6; -} + optional string debug_info = 8; -// Operator Sets -// -// OperatorSets are uniquely identified by a (domain, opset_version) pair. -message OperatorSetIdProto { - // The domain of the operator set being identified. - // The empty string ("") or absence of this field implies the operator - // set that is defined as part of the ONNX specification. - // This field MUST be present in this version of the IR when referring to any other operator set. - optional string domain = 1; + // annotations, it is used for MetaNetDef's metadata + repeated caffe2.Argument annotations = 9; - // The version of the operator set being identified. - // This field MUST be present in this version of the IR. - optional int64 version = 2; } diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 50eaf220c7f721..44f81d6e2d135e 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -8,59 +8,3 @@ from caffe2.proto import caffe2_pb2, torch_pb2 import caffe2.python._import_c_extension as C - - -def ArgumentToAttributeProto(arg): - serialized_arg = None - if hasattr(arg, 'SerializeToString') and callable(arg.SerializeToString): - serialized_arg = arg.SerializeToString() - elif isinstance(arg, bytes): - serialized_arg = arg - else: - raise ValueError('No SerializeToString method is detected. 
' - 'neither arg is bytes.\ntype is {}'.format(type(arg))) - attr = torch_pb2.AttributeProto() - attr.ParseFromString(C.argument_to_attribute_proto(serialized_arg)) - return attr - - -def AttributeProtoToArgument(attr): - serialized_attr = None - if hasattr(attr, 'SerializeToString') and callable(attr.SerializeToString): - serialized_attr = attr.SerializeToString() - elif isinstance(attr, bytes): - serialized_attr = attr - else: - raise ValueError('No SerializeToString method is detected. ' - 'neither attr is bytes.\ntype is {}'.format(type(attr))) - arg = caffe2_pb2.Argument() - arg.ParseFromString(C.attribute_proto_to_argument(serialized_attr)) - return arg - - -def OperatorDefToNodeProto(op_def): - serialized_op_def = None - if hasattr(op_def, 'SerializeToString') and callable(op_def.SerializeToString): - serialized_op_def = op_def.SerializeToString() - elif isinstance(op_def, bytes): - serialized_op_def = op_def - else: - raise ValueError('No SerializeToString method is detected. ' - 'neither op_def is bytes.\ntype is {}'.format(type(op_def))) - node = torch_pb2.NodeProto() - node.ParseFromString(C.operator_def_to_node_proto(serialized_op_def)) - return node - - -def NodeProtoToOperatorDef(node_proto): - serialized_node_proto = None - if hasattr(node_proto, 'SerializeToString') and callable(node_proto.SerializeToString): - serialized_node_proto = node_proto.SerializeToString() - elif isinstance(node_proto, bytes): - serialized_node_proto = node_proto - else: - raise ValueError('No SerializeToString method is detected. ' - 'neither node_proto is bytes.\ntype is {}'.format(type(node_proto))) - op_def = caffe2_pb2.OperatorDef() - op_def.ParseFromString(C.node_proto_to_operator_def(serialized_node_proto)) - return op_def diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index c8de7e9750680f..82c969c901ea61 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -12,239 +12,5 @@ class TestOperator(unittest.TestCase): def setUp(self): workspace.ResetWorkspace() - def testArgument2AttributeProto(self): - arg_f = caffe2_pb2.Argument() - arg_f.name = "TestArgF" - arg_f.f = 10.0 - attr_f = convert.ArgumentToAttributeProto(arg_f) - self.assertEqual(attr_f.name, arg_f.name) - self.assertEqual(attr_f.f, arg_f.f) - - arg_i = caffe2_pb2.Argument() - arg_i.name = "TestArgI" - arg_i.i = 100 - attr_i = convert.ArgumentToAttributeProto(arg_i) - self.assertEqual(attr_i.name, arg_i.name) - self.assertEqual(attr_i.i, arg_i.i) - - arg_s = caffe2_pb2.Argument() - arg_s.name = "TestArgS" - arg_s.s = "TestS".encode("utf-8") - attr_s = convert.ArgumentToAttributeProto(arg_s) - self.assertEqual(attr_s.name, arg_s.name) - self.assertEqual(attr_s.s, arg_s.s) - - # TODO: test net arg - - arg_floats = caffe2_pb2.Argument() - arg_floats.name = "TestArgFloats" - arg_floats.floats.extend([10.0, 11.0, 12.0]) - attr_floats = convert.ArgumentToAttributeProto(arg_floats) - self.assertEqual(attr_floats.name, arg_floats.name) - self.assertEqual(attr_floats.floats, arg_floats.floats) - - arg_ints = caffe2_pb2.Argument() - arg_ints.name = "TestArgInts" - arg_ints.ints.extend([100, 101, 102]) - attr_ints = convert.ArgumentToAttributeProto(arg_ints) - self.assertEqual(attr_ints.name, arg_ints.name) - self.assertEqual(attr_ints.ints, arg_ints.ints) - - arg_strings = caffe2_pb2.Argument() - arg_strings.name = "TestArgStrings" - arg_strings.strings.extend([ - "TestStrings1".encode("utf-8"), - "TestStrings2".encode("utf-8"), - ]) - attr_strings = 
convert.ArgumentToAttributeProto(arg_strings) - self.assertEqual(attr_strings.name, arg_strings.name) - self.assertEqual(attr_strings.strings, arg_strings.strings) - - # TODO: test nets arg - - def testAttributeProto2Argument(self): - attr_f = torch_pb2.AttributeProto() - attr_f.type = torch_pb2.AttributeProto.FLOAT - attr_f.name = "TestAttrF" - attr_f.f = 10.0 - arg_f = convert.AttributeProtoToArgument(attr_f) - self.assertEqual(arg_f.name, attr_f.name) - self.assertEqual(arg_f.f, attr_f.f) - - attr_i = torch_pb2.AttributeProto() - attr_i.type = torch_pb2.AttributeProto.INT - attr_i.name = "TestArgI" - attr_i.i = 100 - arg_i = convert.AttributeProtoToArgument(attr_i) - self.assertEqual(arg_i.name, attr_i.name) - self.assertEqual(arg_i.i, attr_i.i) - - attr_s = torch_pb2.AttributeProto() - attr_s.type = torch_pb2.AttributeProto.STRING - attr_s.name = "TestArgS" - attr_s.s = "TestS".encode("utf-8") - arg_s = convert.AttributeProtoToArgument(attr_s) - self.assertEqual(arg_s.name, attr_s.name) - self.assertEqual(arg_s.s, attr_s.s) - - # TODO: test graph attribute - - attr_floats = torch_pb2.AttributeProto() - attr_floats.type = torch_pb2.AttributeProto.FLOATS - attr_floats.name = "TestAttrFloats" - attr_floats.floats.extend([10.0, 11.0, 12.0]) - arg_floats = convert.AttributeProtoToArgument(attr_floats) - self.assertEqual(arg_floats.name, attr_floats.name) - self.assertEqual(arg_floats.floats, attr_floats.floats) - - attr_ints = torch_pb2.AttributeProto() - attr_ints.type = torch_pb2.AttributeProto.INTS - attr_ints.name = "TestArgInts" - attr_ints.ints.extend([100, 101, 102]) - arg_ints = convert.AttributeProtoToArgument(attr_ints) - self.assertEqual(arg_ints.name, attr_ints.name) - self.assertEqual(arg_ints.ints, attr_ints.ints) - - attr_strings = torch_pb2.AttributeProto() - attr_strings.type = torch_pb2.AttributeProto.STRINGS - attr_strings.name = "TestArgStrings" - attr_strings.strings.extend([ - "TestStrings1".encode("utf-8"), - "TestStrings2".encode("utf-8"), - ]) - arg_strings = convert.AttributeProtoToArgument(attr_strings) - self.assertEqual(arg_strings.name, attr_strings.name) - self.assertEqual(arg_strings.strings, attr_strings.strings) - - # TODO: test graphs attribute - - - def testOperatorDef2NodeProto(self): - op_def = caffe2_pb2.OperatorDef() - op_def.input.extend(["A", "B", "C"]) - op_def.output.extend(["X", "Y"]) - op_def.name = "TestOpName" - op_def.type = "TestOp" - arg1 = caffe2_pb2.Argument() - arg1.name = "TestArg1" - arg1.i = 1 - arg2 = caffe2_pb2.Argument() - arg2.name = "TestArg2" - arg1.s = "TestInfo".encode("utf-8") - op_def.arg.extend([arg1, arg2]) - op_def.device_option.CopyFrom(caffe2_pb2.DeviceOption()) - op_def.engine = "TestEngine".encode("utf-8") - op_def.control_input.extend(["input1", "input2"]) - op_def.is_gradient_op = True - op_def.debug_info = "TestDebugInfo" - - node = convert.OperatorDefToNodeProto(op_def) - - self.assertEqual(node.input, op_def.input) - self.assertEqual(node.output, op_def.output) - self.assertEqual(node.name, op_def.name) - self.assertEqual(node.op_type, op_def.type) - self.assertEqual(node.attribute[0].name, op_def.arg[0].name) - self.assertEqual(node.attribute[1].name, op_def.arg[1].name) - self.assertEqual(node.device_option, op_def.device_option) - node_engine = [a.s.decode("utf-8") for a in node.annotations if a.name == "engine"][0] - self.assertEqual(node_engine, op_def.engine) - node_control_input = [a.strings for a in node.annotations if a.name == "control_input"][0] - self.assertEqual(len(node_control_input), 
len(op_def.control_input)) - for x, y in zip(node_control_input, op_def.control_input): - self.assertEqual(x.decode("utf-8"), y) - self.assertEqual(node.doc_string, op_def.debug_info) - node_is_gradient_op = [a.i for a in node.annotations if a.name == "is_gradient_op"][0] - self.assertEqual(node_is_gradient_op, int(op_def.is_gradient_op)) - - def testNodeProto2OperatorDef(self): - node = torch_pb2.NodeProto() - node.input.extend(["A", "B", "C"]) - node.output.extend(["X", "Y"]) - node.name = "TestOpName" - node.op_type = "TestOp" - attr1 = torch_pb2.AttributeProto() - attr1.name = "TestAttr1" - attr1.type = torch_pb2.AttributeProto.STRING - attr1.s = "TestInfo".encode("utf-8") - attr2 = torch_pb2.AttributeProto() - attr2.name = "TestAttr2" - attr2.type = torch_pb2.AttributeProto.INT - attr2.i = 10 - node.attribute.extend([attr1, attr2]) - node.device_option.CopyFrom(caffe2_pb2.DeviceOption()) - anno1 = torch_pb2.AttributeProto() - anno1.name = "engine" - anno1.type = torch_pb2.AttributeProto.STRING - anno1.s = "TestEngine".encode("utf-8") - anno2 = torch_pb2.AttributeProto() - anno2.name = "control_input" - anno2.type = torch_pb2.AttributeProto.STRINGS - anno2.strings.extend(["input1".encode("utf-8"), "input2".encode("utf-8")]) - anno3 = torch_pb2.AttributeProto() - anno3.name = "is_gradient_op" - anno3.type = torch_pb2.AttributeProto.INT - anno3.i = 1 - node.annotations.extend([anno1, anno2, anno3]) - node.doc_string = "TestDocString".encode("utf-8") - - op_def = convert.NodeProtoToOperatorDef(node) - - self.assertEqual(op_def.input, node.input) - self.assertEqual(op_def.output, node.output) - self.assertEqual(op_def.name, node.name) - self.assertEqual(op_def.type, node.op_type) - self.assertEqual(op_def.arg[0].name, node.attribute[0].name) - self.assertEqual(op_def.arg[1].name, node.attribute[1].name) - self.assertEqual(op_def.device_option, node.device_option) - node_engine = [a.s for a in node.annotations if a.name == "engine"][0] - self.assertEqual(op_def.engine, node_engine.decode("utf-8")) - node_control_input = [a.strings for a in node.annotations if a.name == "control_input"][0] - for x, y in zip(op_def.control_input, node_control_input): - self.assertEqual(x, y.decode("utf-8")) - self.assertEqual(op_def.debug_info, node.doc_string) - node_is_gradient_op = [a.i for a in node.annotations if a.name == "is_gradient_op"][0] - self.assertEqual(int(op_def.is_gradient_op), node_is_gradient_op) - - def testEnd2End(self): - op_def = caffe2_pb2.OperatorDef() - op_def.type = "Add" - op_def.input.extend(["input1"]) - op_def.input.extend(["input2"]) - op_def.output.extend(["output1"]) - node = convert.OperatorDefToNodeProto(op_def) - - input1 = np.random.randn(1, 3, 1, 5).astype(np.float32) - input2 = np.random.randn(2, 1, 4, 1).astype(np.float32) - ref_output1 = input1 + input2 - workspace.FeedBlob("input1", input1) - workspace.FeedBlob("input2", input2) - self.assertEqual(workspace.RunOperatorOnce(node.SerializeToString(), legacy_proto=False), True) - - self.assertEqual(workspace.HasBlob("output1"), True) - fetched_back = workspace.FetchBlob("output1") - np.testing.assert_array_equal(fetched_back, ref_output1) - - def testRoundTrip(self): - op_def = caffe2_pb2.OperatorDef() - op_def.type = "Add" - op_def.input.extend(["input1"]) - op_def.input.extend(["input2"]) - op_def.output.extend(["output1"]) - node = convert.OperatorDefToNodeProto(op_def) - new_op_def = convert.NodeProtoToOperatorDef(node) - - input1 = np.random.randn(1, 3, 1, 5).astype(np.float32) - input2 = np.random.randn(2, 1, 4, 
1).astype(np.float32) - ref_output1 = input1 + input2 - workspace.FeedBlob("input1", input1) - workspace.FeedBlob("input2", input2) - self.assertEqual(workspace.RunOperatorOnce(new_op_def.SerializeToString()), True) - - self.assertEqual(workspace.HasBlob("output1"), True) - fetched_back = workspace.FetchBlob("output1") - np.testing.assert_array_equal(fetched_back, ref_output1) - - if __name__ == '__main__': unittest.main() diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 7062ead045df1c..7ebee57d490848 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -1187,17 +1187,10 @@ void addGlobalMethods(py::module& m) { return true; }); m.def("nets", []() { return gWorkspace->Nets(); }); - m.def("run_operator_once", [](const py::bytes& op_def, bool legacy_proto=true) { + m.def("run_operator_once", [](const py::bytes& op_def) { CAFFE_ENFORCE(gWorkspace); OperatorDef def; - if (legacy_proto) { - CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast(), &def)); - } else { - ::torch::NodeProto node; - CAFFE_ENFORCE( - ParseProtoFromLargeString(op_def.cast(), &node)); - NodeProtoToOperatorDef(node, &def); - } + CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast(), &def)); py::gil_scoped_release g; CAFFE_ENFORCE(gWorkspace->RunOperatorOnce(def)); return true; @@ -1534,38 +1527,6 @@ void addGlobalMethods(py::module& m) { CAFFE_ENFORCE(blob); return BlobStat::sizeBytes(*blob); }); - m.def("argument_to_attribute_proto", [](py::bytes arg_str) -> py::bytes { - Argument arg; - CAFFE_ENFORCE( - ParseProtoFromLargeString(arg_str.cast(), &arg)); - ::torch::AttributeProto attr; - ArgumentToAttributeProto(arg, &attr); - return attr.SerializeAsString(); - }); - m.def("attribute_proto_to_argument", [](py::bytes attr_str) -> py::bytes { - ::torch::AttributeProto attr; - CAFFE_ENFORCE( - ParseProtoFromLargeString(attr_str.cast(), &attr)); - Argument arg; - AttributeProtoToArgument(attr, &arg); - return arg.SerializeAsString(); - }); - m.def("operator_def_to_node_proto", [](py::bytes op_str) -> py::bytes { - OperatorDef op_def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(op_str.cast(), &op_def)); - ::torch::NodeProto node; - OperatorDefToNodeProto(op_def, &node); - return node.SerializeAsString(); - }); - m.def("node_proto_to_operator_def", [](py::bytes node_str) -> py::bytes { - ::torch::NodeProto node_proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(node_str.cast(), &node_proto)); - OperatorDef op_def; - NodeProtoToOperatorDef(node_proto, &op_def); - return op_def.SerializeAsString(); - }); m.def("support_onnx_export", [](const std::string& op) -> bool { const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(op); if (!schema) { diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index a41cc153177639..ef02f64dc993b7 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -163,8 +163,8 @@ def GetOperatorCost(operator, blobs): return C.get_operator_cost(StringifyProto(operator), blobs) -def RunOperatorOnce(operator, legacy_proto=True): - return C.run_operator_once(StringifyProto(operator), legacy_proto) +def RunOperatorOnce(operator): + return C.run_operator_once(StringifyProto(operator)) def RunOperatorsOnce(operators): diff --git a/caffe2/utils/proto_convert.cc b/caffe2/utils/proto_convert.cc index 790bd274291dcb..1d69c8c80c15ac 100644 --- a/caffe2/utils/proto_convert.cc +++ b/caffe2/utils/proto_convert.cc @@ -2,185 +2,4 @@ #include "caffe2/core/logging.h" namespace caffe2 { - -C10_EXPORT void 
ArgumentToAttributeProto( - const Argument& arg, - ::torch::AttributeProto* attr) { - CAFFE_ENFORCE(arg.has_name()); - attr->set_name(arg.name()); - if (arg.has_f()) { - attr->set_f(arg.f()); - } else if (arg.has_i()) { - attr->set_i(arg.i()); - } else if (arg.has_s()) { - attr->set_s(arg.s()); - } else if (arg.has_n()) { - // TODO - CAFFE_THROW("NetDef conversion is not implemented yet."); - } else if (arg.floats_size() > 0) { - attr->mutable_floats()->CopyFrom(arg.floats()); - } else if (arg.ints_size() > 0) { - attr->mutable_ints()->CopyFrom(arg.ints()); - } else if (arg.strings_size() > 0) { - attr->mutable_strings()->CopyFrom(arg.strings()); - } else if (arg.nets_size() > 0) { - // TODO - CAFFE_THROW("NetDefs conversion is not implemented yet."); - } -} - -C10_EXPORT void AttributeProtoToArgument( - const ::torch::AttributeProto& attr, - Argument* arg) { - CAFFE_ENFORCE(attr.has_name()); - arg->set_name(attr.name()); - CAFFE_ENFORCE(attr.has_type()); - const auto type = attr.type(); - if (type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_FLOAT) { - CAFFE_ENFORCE(attr.has_f()); - arg->set_f(attr.f()); - } else if ( - type == - ::torch::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) { - CAFFE_ENFORCE(attr.has_i()); - arg->set_i(attr.i()); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_STRING) { - CAFFE_ENFORCE(attr.has_s()); - arg->set_s(attr.s()); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_TENSOR) { - CAFFE_THROW("Caffe2's Argument does not support tensor as attribute."); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_GRAPH) { - // TODO - CAFFE_THROW("GraphProto conversion is not implemented yet."); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_FLOATS) { - arg->mutable_floats()->CopyFrom(attr.floats()); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_INTS) { - arg->mutable_ints()->CopyFrom(attr.ints()); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_STRINGS) { - arg->mutable_strings()->CopyFrom(attr.strings()); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_TENSORS) { - CAFFE_THROW("Caffe2's Argument does not support tensors as attribute."); - } else if ( - type == - ::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_GRAPHS) { - // TODO - CAFFE_THROW("GraphProtos conversion is not implemented yet."); - } else { - CAFFE_THROW("Unknow Attribute type."); - } -} - -C10_EXPORT void OperatorDefToNodeProto( - const OperatorDef& def, - ::torch::NodeProto* node) { - node->mutable_input()->CopyFrom(def.input()); - node->mutable_output()->CopyFrom(def.output()); - if (def.has_name()) { - node->set_name(def.name()); - } - CAFFE_ENFORCE(def.has_type()); - node->set_op_type(def.type()); - for (int i = 0; i < def.arg_size(); ++i) { - auto attr = node->add_attribute(); - ArgumentToAttributeProto(def.arg(i), attr); - } - if (def.has_device_option()) { - node->mutable_device_option()->CopyFrom(def.device_option()); - } - if (def.has_engine()) { - auto attr = node->add_annotations(); - attr->set_name("engine"); - attr->set_type(::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_STRING); - attr->set_s(def.engine()); - } - if (def.control_input_size() > 0) { - auto attr = 
node->add_annotations(); - attr->set_name("control_input"); - attr->set_type(::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_STRINGS); - attr->mutable_strings()->CopyFrom(def.control_input()); - } - if (def.has_is_gradient_op()) { - auto attr = node->add_annotations(); - attr->set_name("is_gradient_op"); - attr->set_type(::torch::AttributeProto_AttributeType:: - AttributeProto_AttributeType_INT); - if (def.is_gradient_op()) { - attr->set_i(1); - } else { - attr->set_i(0); - } - } - if (def.has_debug_info()) { - node->set_doc_string(def.debug_info()); - } -} - -C10_EXPORT void NodeProtoToOperatorDef( - const ::torch::NodeProto& node, - OperatorDef* def) { - def->mutable_input()->CopyFrom(node.input()); - def->mutable_output()->CopyFrom(node.output()); - if (node.has_name()) { - def->set_name(node.name()); - } - - CAFFE_ENFORCE(node.has_op_type()); - def->set_type(node.op_type()); - for (int i = 0; i < node.attribute_size(); ++i) { - auto arg = def->add_arg(); - AttributeProtoToArgument(node.attribute(i), arg); - } - if (node.has_doc_string()) { - def->set_debug_info(node.doc_string()); - } - for (int i = 0; i < node.annotations_size(); ++i) { - const auto& attr = node.annotations(i); - CAFFE_ENFORCE(attr.has_name()); - if (attr.name() == "engine") { - CAFFE_ENFORCE(attr.has_s()); - def->set_engine(attr.s()); - } else if (attr.name() == "control_input") { - def->mutable_control_input()->CopyFrom(attr.strings()); - } else if (attr.name() == "is_gradient_op") { - CAFFE_ENFORCE(attr.has_i()); - if (i == 0) { - def->set_is_gradient_op(false); - } else { - def->set_is_gradient_op(true); - } - } - auto arg = def->add_arg(); - AttributeProtoToArgument(node.annotations(i), arg); - } - if (node.has_device_option()) { - def->mutable_device_option()->CopyFrom(node.device_option()); - } -} - } // namespace caffe2 diff --git a/caffe2/utils/proto_convert.h b/caffe2/utils/proto_convert.h index a9ca9c3ad4fa41..91bcf1bafa2298 100644 --- a/caffe2/utils/proto_convert.h +++ b/caffe2/utils/proto_convert.h @@ -6,20 +6,6 @@ #include "caffe2/proto/torch_pb.h" namespace caffe2 { - -CAFFE2_API void ArgumentToAttributeProto( - const Argument& arg, - ::torch::AttributeProto* attr); -CAFFE2_API void AttributeProtoToArgument( - const ::torch::AttributeProto& attr, - Argument* arg); -CAFFE2_API void OperatorDefToNodeProto( - const OperatorDef& def, - ::torch::NodeProto* node); -CAFFE2_API void NodeProtoToOperatorDef( - const ::torch::NodeProto& node, - OperatorDef* def); - } // namespace caffe2 #endif // CAFFE2_UTILS_PROTO_CONVERT_H_ From 23f86ad57fb9858f06eca07dbb1000f68d59c25e Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 1 Oct 2018 16:45:06 -0700 Subject: [PATCH 75/82] Back out "[caffe2][mpscnn] Enable multiple external output" Summary: Original commit changeset: 0cea9469cea0 Differential Revision: D10135814 fbshipit-source-id: 9563361cc00f4ce5dc2e903c0fcb10643ee9af26 --- .../mobile/contrib/ios/mpscnn/mpscnn_graph.mm | 60 +++---------------- 1 file changed, 9 insertions(+), 51 deletions(-) diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm index 843c9d89cccb68..b2945d58a1039e 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph.mm @@ -46,34 +46,6 @@ Analysis analyzeNet(const NetDef& net) { return analysis; } -static void rewriteInput(OperatorDef* op, int i) { - auto input = op->input(i); - op->set_input(i, input + "_I"); -} - -static void rewriteOutput(OperatorDef* 
op, int i) { - auto output = op->output(i); - op->set_output(i, output + "_M"); -} - -static void insertInputCopyToMPSCNNOp( - NetDef& predictNet, - const std::string& cpu_blob) { - auto* op = predictNet.add_op(); - op->set_type("CopyToMPSCNN"); - op->add_input(cpu_blob); - op->add_output(cpu_blob + "_I"); -} - -static void insertOutputCopyFromMPSCNNOp( - NetDef& predictNet, - const std::string& cpu_blob) { - auto* op = predictNet.add_op(); - op->set_type("CopyFromMPSCNN"); - op->add_input(cpu_blob + "_M"); - op->add_output(cpu_blob); -} - NetDef insertInputOutputCopyOps(const NetDef& def) { // Do some validation of the outputs. For this version, we require: // - a single input (first element of external_input()) is consumed by the @@ -110,8 +82,6 @@ NetDef insertInputOutputCopyOps(const NetDef& def) { op.add_output("__METAL_INPUT_COPY__"); } - std::unordered_set output_set; - for (auto i = 0; i < def.op_size(); ++i) { const auto& ogOp = def.op(i); auto op = mdef.add_op(); @@ -120,29 +90,17 @@ NetDef insertInputOutputCopyOps(const NetDef& def) { CAFFE_ENFORCE_EQ(op->input(0), def.external_input(0)); op->set_input(0, "__METAL_INPUT_COPY__"); } - // rewrite input - for (auto j = 0; j < op->input_size(); ++j) { - if (output_set.find(op->input(j)) != output_set.end()) { - insertInputCopyToMPSCNNOp(mdef, op->input(j)); - rewriteInput(op, j); - } - } - - // if the output is in external output, copy from metal when necessary - for (auto j = 0; j < op->output_size(); ++j) { - for (auto k = 0; k < def.external_output_size(); ++k) { - // Assuming external output blob has unique name, e.g. only version 0 - // of the blob is used as the output - if (op->output(j) == def.external_output(k)) { - output_set.insert(op->output(j)); - insertOutputCopyFromMPSCNNOp(mdef, op->output(j)); - // rewrite output to output_M for the operator - rewriteOutput(op, j); - } - } + if (i == def.op_size() - 1) { + CAFFE_ENFORCE_EQ(op->output(0), def.external_output(0)); + op->set_output(0, "__METAL_OUTPUT_COPY__"); } } - + { + auto& op = *(mdef.add_op()); + op.set_type("CopyFromMPSCNN"); + op.add_input("__METAL_OUTPUT_COPY__"); + op.add_output(def.external_output(0)); + } return mdef; } From 26df16eb21a9e1fdfdfe534e013593b914e2b55b Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Mon, 1 Oct 2018 17:11:02 -0700 Subject: [PATCH 76/82] Clear previous device option when keep_device is set in load op Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12240 Reviewed By: jerryzh168 Differential Revision: D10133933 fbshipit-source-id: 05935bd527177f936c1d08626888d43dedbf5ce4 --- caffe2/operators/load_save_op.cc | 1 + caffe2/operators/load_save_op_gpu.cc | 1 + caffe2/python/operator_test/load_save_test.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/caffe2/operators/load_save_op.cc b/caffe2/operators/load_save_op.cc index ffef2f8b39fb5b..50dcf5259a84eb 100644 --- a/caffe2/operators/load_save_op.cc +++ b/caffe2/operators/load_save_op.cc @@ -5,6 +5,7 @@ namespace caffe2 { template <> void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { + proto->mutable_tensor()->clear_device_detail(); proto->mutable_tensor()->mutable_device_detail()->set_device_type( PROTO_CPU); } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index 8458fab901ed8b..f81b7789699c15 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -6,6 +6,7 @@ namespace caffe2 { template <> void 
LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { + proto->mutable_tensor()->clear_device_detail(); auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); device_detail->set_device_id(CaffeCudaGetDevice()); diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index b90a7f84b4ed8a..8e3817034d435b 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import errno import hypothesis.strategies as st -from hypothesis import given +from hypothesis import given, assume import numpy as np import os import shutil @@ -42,6 +42,8 @@ def load_save(self, src_device_type, src_gpu_id, np.int16, np.int32, np.int64, np.uint8, np.uint16] arrays = [np.random.permutation(6).reshape(2, 3).astype(T) for T in dtypes] + assume(src_device_type == caffe2_pb2.CUDA or src_gpu_id == 0) + assume(dst_device_type == caffe2_pb2.CUDA or dst_gpu_id == 0) src_device_option = core.DeviceOption( src_device_type, src_gpu_id) dst_device_option = core.DeviceOption( From ecace9eb217903cd7f15d97516b7ef93c9820e5b Mon Sep 17 00:00:00 2001 From: Shicong Zhao Date: Mon, 1 Oct 2018 18:23:15 -0700 Subject: [PATCH 77/82] Move crf in caffe2 from fb to oss (#12200) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12200 moved crf_viterbi_op, copied crf_predict and crf_viterbi_test to oss Reviewed By: Yangqing Differential Revision: D10118341 fbshipit-source-id: 51e30e57d280d6ca75fc0b488f743794f23b589f --- caffe2/operators/crf_viterbi_op.cc | 221 +++++++++++++++++++++++++++++ caffe2/python/crf_predict.py | 33 +++++ caffe2/python/crf_viterbi_test.py | 45 ++++++ 3 files changed, 299 insertions(+) create mode 100644 caffe2/operators/crf_viterbi_op.cc create mode 100644 caffe2/python/crf_predict.py create mode 100644 caffe2/python/crf_viterbi_test.py diff --git a/caffe2/operators/crf_viterbi_op.cc b/caffe2/operators/crf_viterbi_op.cc new file mode 100644 index 00000000000000..39a5391d735fcd --- /dev/null +++ b/caffe2/operators/crf_viterbi_op.cc @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include "caffe2/core/blob_serialization.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { +namespace { + +void RowwiseMaxAndArg( + const float* mat, + int32_t N, + int32_t D, + float* rowMax, + int32_t* argMax) { + auto eigenMat = ConstEigenMatrixMap(mat, D, N); + for (auto i = 0; i < D; i++) { + // eigenMat.row(i) is equivalent to column i in mat + rowMax[i] = eigenMat.row(i).maxCoeff(argMax + i); + } +} +void ColwiseMaxAndArg( + const float* mat, + int32_t N, + int32_t D, + float* colMax, + int32_t* argMax) { + auto eigenMat = ConstEigenMatrixMap(mat, D, N); + for (auto i = 0; i < N; i++) { + // eigenMat.col(i) is equivalent to row i in mat + colMax[i] = eigenMat.col(i).maxCoeff(argMax + i); + } +} + +class ViterbiPathOp : public Operator { + public: + ViterbiPathOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + void GatherRow( + const TensorCPU& data, + int32_t rowIndex, + int32_t block_size, + int32_t block_bytesize, + TensorCPU* outRow) { + CAFFE_ENFORCE( + 0 <= rowIndex && rowIndex < data.dim(0), + "rowIndex is out of DATA bounds"); + auto out = static_cast(outRow->raw_mutable_data(data.meta())); + auto src_base 
= static_cast<const char*>(data.raw_data());
+    auto src = src_base + rowIndex * block_bytesize;
+    context_.CopyItemsSameDevice(data.meta(), block_size, src, out);
+  }
+
+  void
+  AddColToMat(const TensorCPU& mat, const TensorCPU& col, TensorCPU* result) {
+    float* resultData = result->template mutable_data<float>();
+    const float* colData = col.template data<float>();
+    // Initialize the columns of the result to be = the input col
+    for (auto i = 0; i < result->dim32(1); i++) {
+      for (auto j = 0; j < result->dim32(0); j++) {
+        resultData[i * result->dim32(0) + j] = colData[i];
+      }
+    }
+    // Element-wise add of the result and the input matrix
+    math::Add<float, CPUContext>(
+        mat.size(),
+        resultData,
+        mat.template data<float>(),
+        resultData,
+        &context_);
+  }
+
+  bool RunOnDevice() override {
+    auto& predictions = Input(0);
+    auto& transitions = Input(1);
+    auto* viterbiPath = Output(0);
+
+    CAFFE_ENFORCE(
+        predictions.ndim() == 2 && transitions.ndim() == 2,
+        "Predictions and transitions should be 2D matrices");
+
+    CAFFE_ENFORCE(
+        predictions.dim(1) == transitions.dim(0),
+        "Predictions and transitions dimensions not matching");
+
+    auto seqLen = predictions.dim32(0);
+
+    viterbiPath->Resize(seqLen);
+    auto block_size = predictions.size() / predictions.dim(0);
+    auto block_bytesize =
+        predictions.size_from_dim(1) * predictions.meta().itemsize();
+    Tensor backpointers(CPU);
+    backpointers.ResizeLike(predictions);
+
+    Tensor trellis(std::vector<int64_t>{block_size}, CPU);
+    Tensor dpMat(CPU);
+    dpMat.ResizeLike(transitions);
+    Tensor dpMax(std::vector<int64_t>{block_size}, CPU);
+    GatherRow(predictions, 0, block_size, block_bytesize, &trellis);
+    for (auto i = 1; i < seqLen; i++) {
+      AddColToMat(transitions, trellis, &dpMat);
+      RowwiseMaxAndArg(
+          dpMat.template data<float>(),
+          dpMat.dim(0),
+          dpMat.dim(1),
+          dpMax.template mutable_data<float>(),
+          backpointers.template mutable_data<int32_t>() + (i * block_size));
+
+      GatherRow(predictions, i, block_size, block_bytesize, &trellis);
+      math::Add<float, CPUContext>(
+          trellis.size(),
+          trellis.template data<float>(),
+          dpMax.template data<float>(),
+          trellis.template mutable_data<float>(),
+          &context_);
+    }
+
+    Tensor tMax(std::vector<int64_t>{1}, CPU);
+    Tensor tArgMax(std::vector<int64_t>{1}, CPU);
+    ColwiseMaxAndArg(
+        trellis.template data<float>(),
+        1,
+        trellis.size(),
+        tMax.template mutable_data<float>(),
+        tArgMax.template mutable_data<int32_t>());
+
+    std::vector<int32_t> viterbiVec;
+    viterbiVec.push_back(tArgMax.template data<int32_t>()[0]);
+    Tensor bpEntry(std::vector<int64_t>{block_size}, CPU);
+    block_bytesize =
+        backpointers.size_from_dim(1) * backpointers.meta().itemsize();
+    for (auto i = seqLen - 1; i > 0; i--) {
+      GatherRow(backpointers, i, block_size, block_bytesize, &bpEntry);
+      viterbiVec.push_back(bpEntry.template data<int32_t>()[viterbiVec.back()]);
+    }
+    std::reverse_copy(
+        viterbiVec.begin(),
+        viterbiVec.end(),
+        viterbiPath->template mutable_data<int32_t>());
+    return true;
+  }
+};
+class SwapBestPathOp : public Operator<CPUContext> {
+ public:
+  SwapBestPathOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {}
+  bool RunOnDevice() override {
+    auto& data = Input(0);
+    auto& newBestIdicies = Input(1);
+    auto* updatedData = Output(0);
+
+    CAFFE_ENFORCE(
+        data.ndim() == 2 && newBestIdicies.ndim() == 1,
+        "predictions should be a 2D matrix and bestPath should be a 1D vector");
+
+    CAFFE_ENFORCE(
+        data.dim(0) == newBestIdicies.dim(0),
+        "predictions and bestPath dimensions not matching");
+
+    updatedData->ResizeLike(data);
+    float* outData = updatedData->template mutable_data<float>();
+    context_.CopyItemsSameDevice(
+        data.meta(), data.size(), data.template data<float>(), outData);
+
+    Tensor bestScores(CPU);
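+    // bestScores / oldBestIndices (below) record, for every timestep, the
+    // current columnwise maximum and its position; the swap loop that
+    // follows then exchanges each old argmax entry with the entry at the
+    // externally chosen index, making the supplied path the columnwise
+    // argmax of the updated predictions.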
+    bestScores.ResizeLike(newBestIdicies);
+    Tensor oldBestIndices(CPU);
+    oldBestIndices.ResizeLike(newBestIdicies);
+
+    ColwiseMaxAndArg(
+        data.template data<float>(),
+        data.dim(0),
+        data.dim(1),
+        bestScores.template mutable_data<float>(),
+        oldBestIndices.template mutable_data<int32_t>());
+
+    auto block_size = data.size() / data.dim(0);
+
+    const int32_t* oldBestIdx = oldBestIndices.template data<int32_t>();
+    const int32_t* newIdx = newBestIdicies.template data<int32_t>();
+
+    for (auto i = 0; i < data.dim32(0); i++) {
+      std::swap(
+          outData[i * block_size + newIdx[i]],
+          outData[i * block_size + oldBestIdx[i]]);
+    }
+    return true;
+  }
+};
+REGISTER_CPU_OPERATOR(ViterbiPath, ViterbiPathOp);
+OPERATOR_SCHEMA(ViterbiPath)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a predictions matrix and a transitions matrix, get the path with the best
+score
+)DOC")
+    .Input(0, "predictions", "N*D predictions matrix")
+    .Input(1, "transitions", "D*D transitions matrix")
+    .Output(0, "viterbi_path", "N*1 vector holds the best path indices");
+NO_GRADIENT(ViterbiPath);
+REGISTER_CPU_OPERATOR(SwapBestPath, SwapBestPathOp);
+OPERATOR_SCHEMA(SwapBestPath)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Given a sequence of indices and a matrix, enforce that these indices have the
+best columnwise scores
+)DOC")
+    .Input(0, "predictions", "N*D predictions matrix")
+    .Input(1, "bestPath", "N*1 vector holds the best path indices")
+    .Output(0, "new_predictions", "N*D updated predictions matrix");
+NO_GRADIENT(SwapBestPath);
+} // namespace
+} // namespace caffe2
diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py
new file mode 100644
index 00000000000000..dd1c8720bfb153
--- /dev/null
+++ b/caffe2/python/crf_predict.py
@@ -0,0 +1,33 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import numpy as np
+from caffe2.python.crf import CRFWithLoss
+
+
+def crf_update_predictions(model, crf_with_loss, classes):
+    return apply_crf(
+        model.param_init_net,
+        model.net,
+        crf_with_loss.transitions,
+        classes,
+        crf_with_loss.num_classes,
+    )
+
+
+def apply_crf(init_net, net, transitions, predictions, num_classes):
+    padded_classes = CRFWithLoss.pad_predictions(
+        predictions, init_net, net, num_classes
+    )
+    bestPath = net.ViterbiPath([padded_classes, transitions])
+    new_padded_classes = net.SwapBestPath([padded_classes, bestPath])
+    # Revert the effect of pad_predictions by removing the last two rows and
+    # the last two columns
+    new_classes = net.RemovePadding(
+        [new_padded_classes], padding_width=1, end_padding_width=1
+    )
+    slice_starts = np.array([0, 0]).astype(np.int32)
+    slice_ends = np.array([-1, -3]).astype(np.int32)
+    slice_starts = net.GivenTensorIntFill([], shape=[2], values=slice_starts)
+    slice_ends = net.GivenTensorIntFill([], shape=[2], values=slice_ends)
+    new_classes = net.Slice([new_classes, slice_starts, slice_ends])
+    return new_classes
diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py
new file mode 100644
index 00000000000000..a4502d27e3e990
--- /dev/null
+++ b/caffe2/python/crf_viterbi_test.py
@@ -0,0 +1,45 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from caffe2.python import workspace, crf
+
+from caffe2.python.cnn import CNNModelHelper
+from caffe2.python.crf_predict import crf_update_predictions
+from caffe2.python.test_util import TestCase
+import hypothesis.strategies as st
+from hypothesis import given
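+# numpy supplies the random emission and transition scores fed to both the
+# operator-based decoder and the reference implementation below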
+import numpy as np + + +class TestCrfDecode(TestCase): + + @given(num_tags=st.integers(2, 4), num_words=st.integers(2, 15)) + def test_crf_viterbi(self, num_tags, num_words): + model = CNNModelHelper(name='external') + predictions = np.random.randn(num_words, num_tags).astype(np.float32) + transitions = np.random.uniform( + low=-1, high=1, size=(num_tags + 2, num_tags + 2) + ).astype(np.float32) + predictions_blob, transitions_blob = ( + model.net.AddExternalInputs('predictions', 'crf_transitions') + ) + workspace.FeedBlob(str(transitions_blob), transitions) + workspace.FeedBlob(str(predictions_blob), predictions) + crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob) + + updated_predictions = crf_update_predictions( + model, crf_layer, predictions_blob + ) + ref_predictions = crf_layer.update_predictions(predictions_blob) + + workspace.RunNetOnce(model.param_init_net) + workspace.RunNetOnce(model.net) + + updated_predictions = workspace.FetchBlob(str(updated_predictions)) + ref_predictions = workspace.FetchBlob(str(ref_predictions)) + np.testing.assert_allclose( + updated_predictions, + ref_predictions, + atol=1e-4, rtol=1e-4, err_msg='Mismatch in CRF predictions' + ) From 8af06d8114839ba555e685a850eab4d2e1b4c5e5 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Mon, 1 Oct 2018 18:23:26 -0700 Subject: [PATCH 78/82] Use DFS scheduling only within single device (#11848) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11848 Avoid crossing the boundary between devices when using DFS scheduling Reviewed By: romain-intel Differential Revision: D9931091 fbshipit-source-id: 1f3cf52127830048ed1db50b01677b66eeed8b32 --- caffe2/core/net_async_base.cc | 10 ++++++++++ caffe2/core/net_async_base.h | 2 ++ caffe2/core/net_async_scheduling.cc | 17 ++++++++++++++--- caffe2/core/net_async_scheduling.h | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index d2fad7cff10b1c..acc30e565170de 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -285,6 +285,16 @@ int AsyncNetBase::numOps(int task_id) const { return chains_[task_id].size(); } +const OperatorBase* AsyncNetBase::firstTaskOp(int task_id) const { + auto op_id = chains_[task_id].front(); + return operator_nodes_[op_id].operator_.get(); +} + +const OperatorBase* AsyncNetBase::lastTaskOp(int task_id) const { + auto op_id = chains_[task_id].back(); + return operator_nodes_[op_id].operator_.get(); +} + void AsyncNetBase::asyncWait( int task_id, int stream_id, diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 6d76647535c1af..30948853dfb410 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -66,6 +66,8 @@ class CAFFE2_API AsyncNetBase : public NetBase { int getParentCount(int child_id); bool testAndSetScheduled(int task_id); int numOps(int task_id) const; + const OperatorBase* firstTaskOp(int task_id) const; + const OperatorBase* lastTaskOp(int task_id) const; void asyncWait( int task_id, diff --git a/caffe2/core/net_async_scheduling.cc b/caffe2/core/net_async_scheduling.cc index 7feb3631abfd66..80d5807295f75a 100644 --- a/caffe2/core/net_async_scheduling.cc +++ b/caffe2/core/net_async_scheduling.cc @@ -35,6 +35,17 @@ void AsyncSchedulingNet::Wait() { } } +bool AsyncSchedulingNet::isInlineTask(int parent_id, int child_id) const { + if (!use_dfs_scheduling_) { + return false; + } + const auto* last_parent_op = lastTaskOp(parent_id); + const 
auto* first_child_op = firstTaskOp(child_id); + // check that we do not cross device boundary + return IsSameDevice( + last_parent_op->device_option(), first_child_op->device_option()); +} + void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { if (!testAndSetScheduled(task_id)) { return; @@ -63,7 +74,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { canSchedule(child_id)) { // if DFS scheduling is enabled, run children inline, // ignore DFS scheduling in callbacks - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } else { bool parent_failed = false; bool parent_needs_polling = false; @@ -102,7 +113,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { if (parent_failed) { // one of parents failed, set failure flag and wrap up execution success_ = false; - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } else if (parent_needs_polling) { // some parents are blocking us from scheduling a child and don't // support callbacks, using polling @@ -119,7 +130,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { } } else { // we're ready to schedule a child - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } } } diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h index 4fcdf4b7316818..69563c4f20b325 100644 --- a/caffe2/core/net_async_scheduling.h +++ b/caffe2/core/net_async_scheduling.h @@ -22,6 +22,7 @@ class CAFFE2_API AsyncSchedulingNet : public AsyncNetBase { void reset() override; virtual void finishRun(); void parentCallback(int parent_id); + bool isInlineTask(int parent_id, int child_id) const; std::mutex running_mutex_; std::condition_variable running_cv_; From 2cbcaf4544a295783f29c3c3bdb209f5272f5bf4 Mon Sep 17 00:00:00 2001 From: iotamudelta Date: Mon, 1 Oct 2018 18:28:10 -0700 Subject: [PATCH 79/82] Skip failing tests in test_sparse (#12229) Summary: Skip the recently introduced tests that fail on ROCm Pull Request resolved: https://github.com/pytorch/pytorch/pull/12229 Differential Revision: D10138146 Pulled By: bddppq fbshipit-source-id: a0f1ff97fabb71f635a468e8030dbe32d388de49 --- test/test_sparse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_sparse.py b/test/test_sparse.py index 831f0f746ae312..a91681d4767049 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -496,6 +496,7 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + @skipIfRocm def test_Sparse_to_Sparse_copy_(self): # This is for testing torch.copy_(SparseTensor, SparseTensor) sparse_dims = 3 @@ -535,6 +536,7 @@ def test_Sparse_to_Sparse_copy_(self): self.assertEqual(None, x1.grad) @unittest.skipIf(torch.cuda.device_count() < 2, "no multi-GPU") + @skipIfRocm def test_Sparse_to_Sparse_copy_multi_gpu(self): # This is for testing torch.copy_(SparseTensor, SparseTensor) across GPU devices sparse_dims = 3 From 696498d9e4bfecd2819b5d2abad42d9a11acad19 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 1 Oct 2018 21:24:22 -0700 Subject: [PATCH 80/82] Delete stride updating logic from Caffe2, and make PyTorch error in this case. (#12236) Summary: Strides appear to cause a huge memory regression in some of our internal training workflows. This diff stems the bleeding, while we figure out exactly what happened. 
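
To make the new contract concrete, here is a minimal standalone sketch of the
invariant (illustrative names only; the real checks live in
aten/src/ATen/core/TensorImpl.{h,cpp} below):

#include <cassert>
#include <cstdint>
#include <vector>

// Caffe2-style tensors now keep strides_ empty; any PyTorch-side strides
// query must fail loudly instead of returning stale values.
struct TensorImplSketch {
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Caffe2 path after this patch: drop strides instead of recomputing them.
  void update_to_contiguous_strides() {
    strides_.resize(0);
  }

  // PyTorch path: meaningful strides are required.
  const std::vector<int64_t>& strides() const {
    assert(strides_.size() == sizes_.size() &&
           "Caffe2 tensors don't (yet) have meaningful strides");
    return strides_;
  }
};

int main() {
  TensorImplSketch t;
  t.sizes_ = {2, 3};
  t.update_to_contiguous_strides(); // strides_ is now empty
  // t.strides();                   // would trip the assertion, mirroring
  //                                // the AT_ASSERTM added in this diff
  return 0;
}
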
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12236 Reviewed By: dzhulgakov Differential Revision: D10134319 fbshipit-source-id: 1547c89a65c05473c409c0977c19c99dcaefb89c --- aten/src/ATen/core/TensorImpl.cpp | 10 ++++++++++ aten/src/ATen/core/TensorImpl.h | 13 +++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 5b568482d8dfe2..d8f38e98ef4434 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -45,6 +45,9 @@ IntList TensorImpl::sizes() const { } IntList TensorImpl::strides() const { + AT_ASSERTM(strides_.size() == sizes_.size(), + "Caffe2 tensors don't (yet) have meaningful strides and cannot " + "be used in PyTorch."); return strides_; } @@ -52,6 +55,10 @@ bool TensorImpl::compute_contiguous() const { bool is_contiguous = true; if (is_empty()) return is_contiguous; + if (strides_.empty()) { + // Special case for Caffe2 tensors which don't have strides set. + return true; + } int64_t z = 1; for (int64_t d = dim() - 1; d >= 0; d--) { if (size(d) != 1) { @@ -82,6 +89,9 @@ int64_t TensorImpl::size(int64_t d) const { } int64_t TensorImpl::stride(int64_t d) const { + AT_ASSERTM(strides_.size() == sizes_.size(), + "Caffe2 tensors don't (yet) have meaningful strides and cannot " + "be used in PyTorch."); d = at::maybe_wrap_dim(d, dim(), false); return strides_[d]; } diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index f899c7ec1d1446..7d7ce6a980249c 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -282,13 +282,13 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; + sizes_.at(dim) = new_size; refresh_numel(); refresh_contiguous(); } virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; + strides_.at(dim) = new_stride; refresh_numel(); refresh_contiguous(); } @@ -864,14 +864,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } inline void update_to_contiguous_strides() { - strides_.resize(sizes_.size()); - if (dim() > 0) { - int last_idx = dim() - 1; - strides_[last_idx] = 1; - for (auto i = last_idx - 1; i >= 0; --i) { - strides_[i] = strides_[i + 1] * std::max(sizes_[i + 1], 1); - } - } + strides_.resize(0); is_contiguous_ = true; } From ff608a9ff3edded33764c8631427e92c7288bafb Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Mon, 1 Oct 2018 21:44:08 -0700 Subject: [PATCH 81/82] Back out "Revert D10123245: Back out "codemod cuda_gpu_id to device_id"" (#12232) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/12232 Original commit changeset: fca91fea58b7 This adds proper modifications to the DeviceType <->DeviceOption conversion code added in D10033396 Reviewed By: jerryzh168 Differential Revision: D10132473 fbshipit-source-id: 801ef777e2950982cb47b48051b1471a0a91e64b --- caffe2/contrib/nccl/cuda_nccl_op_gpu.cc | 4 +- caffe2/contrib/nccl/nccl_ops_test.py | 2 +- caffe2/contrib/prof/prof_dag_net.cc | 4 +- .../tensorboard/tensorboard_exporter.py | 2 +- caffe2/contrib/warpctc/ctc_ops_test.py | 8 +- caffe2/core/blob_gpu_test.cc | 4 +- caffe2/core/context_gpu.cu | 2 +- caffe2/core/context_gpu.h | 6 +- caffe2/core/cudnn_wrappers.h | 6 +- caffe2/core/event_gpu.cc | 16 ++-- caffe2/core/hip/event_hip.cc | 2 +- caffe2/core/memonger.cc | 4 +- caffe2/core/net_async_base.cc | 4 +- 
caffe2/core/net_async_dag_gpu.cc | 2 +- caffe2/core/net_gpu_test.cc | 2 +- caffe2/core/operator.cc | 2 +- caffe2/mkl/utils/mkl_memory.cc | 2 +- caffe2/observers/profile_observer_gpu.cc | 4 +- caffe2/onnx/backend.cc | 2 +- caffe2/operators/load_save_op_gpu.cc | 2 +- .../rnn/recurrent_network_executor_gpu.cc | 4 +- caffe2/proto/caffe2.proto | 2 +- caffe2/proto/caffe2_pb.h | 46 ++++++++++- caffe2/python/cnn.py | 2 +- caffe2/python/core.py | 16 ++-- caffe2/python/core_test.py | 82 +++++++++---------- caffe2/python/data_parallel_model.py | 6 +- caffe2/python/hypothesis_test_util.py | 2 +- caffe2/python/model_helper.py | 4 +- caffe2/python/muji.py | 2 +- caffe2/python/net_printer.py | 4 +- caffe2/python/numa_test.py | 2 +- caffe2/python/onnx/backend_rep.py | 2 +- caffe2/python/operator_test/load_save_test.py | 2 +- caffe2/python/operator_test/rnn_cell_test.py | 2 +- caffe2/python/optimizer.py | 10 +-- .../predictor/predictor_exporter_test.py | 2 +- caffe2/python/pybind_state_dlpack.h | 4 +- caffe2/utils/proto_utils.cc | 4 +- caffe2/utils/proto_utils_test.cc | 4 +- .../pyHIPIFY/cuda_to_hip_mappings.py | 2 +- 41 files changed, 163 insertions(+), 121 deletions(-) diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc index 4c5313ff4b3032..ea8b3494c6a036 100644 --- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc @@ -11,7 +11,7 @@ nccl::NCCLExecution getNCCLElements( // We either do an N-N op, or an N-1 op. CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1); nccl::NCCLExecution ex; - ex.stream_gpu_id = context.device_id(); + ex.stream_gpu_id = context.cuda_gpu_id(); ex.stream = context.cuda_stream(); ex.root = op->template GetSingleArgument("root", 0); ex.elements.resize(op->InputSize()); @@ -204,7 +204,7 @@ std::pair, std::vector> ncclOpDevInfer( for (int i = 0; i < def.input().size(); ++i) { DeviceOption dev; dev.set_device_type(1); - dev.set_device_id(i); + dev.set_cuda_gpu_id(i); opt.push_back(dev); } return std::make_pair(opt, opt); diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index f6c22a7d750127..7e8a61e9de241d 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -21,7 +21,7 @@ def gpu_device(i): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = i + device_option.cuda_gpu_id = i return device_option diff --git a/caffe2/contrib/prof/prof_dag_net.cc b/caffe2/contrib/prof/prof_dag_net.cc index c8678652c3138f..16917ddc154fc9 100644 --- a/caffe2/contrib/prof/prof_dag_net.cc +++ b/caffe2/contrib/prof/prof_dag_net.cc @@ -33,9 +33,9 @@ void ProfDAGNet::ValidateOpTensorDevices() { had_mismatches = true; LOG(INFO) << "== PERFORMANCE WARNING == \n" << " Operator " << node.operator_->debug_def().type() - << " expects GPU " << mismatch.second.first.device_id() + << " expects GPU " << mismatch.second.first.cuda_gpu_id() << " but tensor [" << mismatch.first << "] is on GPU " - << mismatch.second.second.device_id(); + << mismatch.second.second.cuda_gpu_id(); } } if (!had_mismatches) { diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index cc2c3d85c96877..93ade48e7d267d 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -177,7 +177,7 @@ def _tf_device(device_option): if device_option.device_type == caffe2_pb2.CPU: return 
"/cpu:*" if device_option.device_type == caffe2_pb2.CUDA: - return "/gpu:{}".format(device_option.device_id) + return "/gpu:{}".format(device_option.cuda_gpu_id) raise Exception("Unhandled device", device_option) diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 3b21c8b667473c..25bb0a39e3a965 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -79,11 +79,11 @@ def test_ctc_cost_cpu(self): def test_ctc_cost_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=False) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=False, skip_input_lengths=True) @@ -99,10 +99,10 @@ def test_ctc_forward_only_cpu(self): def test_ctc_forward_only_gpu(self): self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=True) self.verify_cost( caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, - device_id=0), + cuda_gpu_id=0), is_test=True, skip_input_lengths=True) diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 8b4127e403a452..55eafdede7269a 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -195,7 +195,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { } EXPECT_TRUE(tensor_proto.has_device_detail()); EXPECT_EQ(tensor_proto.device_detail().device_type(), PROTO_CUDA); - EXPECT_EQ(tensor_proto.device_detail().device_id(), gpu_id); + EXPECT_EQ(tensor_proto.device_detail().cuda_gpu_id(), gpu_id); // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); @@ -205,7 +205,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if we force the restored blob on a different device, we // can still get so. blob.Reset(); - proto.mutable_tensor()->mutable_device_detail()->set_device_id(0); + proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index f10fe067ac746c..0d9e2686212a1e 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -256,7 +256,7 @@ CUDAContext::CUDAContext(const int gpu_id) CUDAContext::CUDAContext(const DeviceOption& option) : gpu_id_( - option.has_device_id() ? RectifyGPUID(option.device_id()) + option.has_cuda_gpu_id() ? RectifyGPUID(option.cuda_gpu_id()) : CaffeCudaGetDevice()), random_seed_( option.has_random_seed() ? 
option.random_seed() diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 65ba4a006a94af..ce73f5f942828b 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -184,7 +184,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } } - inline int device_id() const { + inline int cuda_gpu_id() const { return gpu_id_; } @@ -283,7 +283,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { } static bool IsStreamFree(const DeviceOption& option, int stream_id) { - auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id); + auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id); return cudaStreamQuery(stream) == cudaSuccess; } @@ -393,7 +393,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* data) override { device->set_device_type(TypeToProto(GetDeviceType())); - device->set_device_id(GetGPUIDForPointer(data)); + device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } protected: diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index dea138e9ad507c..1bd39fa62a399f 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -122,9 +122,9 @@ class CuDNNWrapper { void with_cudnn_state(size_t state_idx, F&& f) { CAFFE_ENFORCE( state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx"); - auto& sync_state = cudnn_states()[context_->device_id()][state_idx]; + auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx]; - DeviceGuard dg(context_->device_id()); + DeviceGuard dg(context_->cuda_gpu_id()); // We need to serialize execution on the CuDNNState as we can't // allow multiple threads to race through the cudaEventRecord @@ -132,7 +132,7 @@ class CuDNNWrapper { // execution) std::lock_guard g(sync_state.mutex); if (!sync_state.state.get()) { - sync_state.state.reset(new CuDNNState(context_->device_id())); + sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id())); } CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f); } diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc index 44aec8d3f2b8f4..6253ca19c9ab70 100644 --- a/caffe2/core/event_gpu.cc +++ b/caffe2/core/event_gpu.cc @@ -9,21 +9,21 @@ namespace caffe2 { struct CudaEventWrapper { explicit CudaEventWrapper(const DeviceOption& option) : cuda_stream_(nullptr), - device_id_(option.device_id()), + cuda_gpu_id_(option.cuda_gpu_id()), status_(EventStatus::EVENT_INITIALIZED) { CAFFE_ENFORCE(option.device_type(), PROTO_CUDA); - DeviceGuard g(device_id_); + DeviceGuard g(cuda_gpu_id_); CUDA_ENFORCE(cudaEventCreate( &cuda_event_, cudaEventDefault | cudaEventDisableTiming)); } ~CudaEventWrapper() { - DeviceGuard g(device_id_); + DeviceGuard g(cuda_gpu_id_); CUDA_CHECK(cudaEventDestroy(cuda_event_)); } cudaEvent_t cuda_event_; cudaStream_t cuda_stream_; - int device_id_; + int cuda_gpu_id_; std::atomic status_; std::mutex mutex_recorded_; @@ -65,12 +65,12 @@ void EventRecordCUDA(Event* event, const void* context, const char* err_msg) { const auto& current_device = CaffeCudaGetDevice(); CAFFE_ENFORCE_EQ( current_device, - wrapper->device_id_, + wrapper->cuda_gpu_id_, "When you call EventRecordCUDA, your current device should be the same " "as the device specified by the event."); CAFFE_ENFORCE_EQ( current_device, - static_cast(context)->device_id()); + static_cast(context)->cuda_gpu_id()); CUDA_ENFORCE(cudaEventRecord( wrapper->cuda_event_, 
static_cast(context)->cuda_stream())); @@ -96,7 +96,7 @@ void EventFinishCUDA(const Event* event) { if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) { // ok, even if event is already completed and status was not yet updated - DeviceGuard g(wrapper->device_id_); + DeviceGuard g(wrapper->cuda_gpu_id_); auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_); if (cudaResult == cudaSuccess) { wrapper->status_ = EventStatus::EVENT_SUCCESS; @@ -127,7 +127,7 @@ void EventWaitCUDACUDA(const Event* event, void* context) { if (context_stream != event_stream) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->device_id()); + // static_cast(context)->cuda_gpu_id()); CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0)); } } diff --git a/caffe2/core/hip/event_hip.cc b/caffe2/core/hip/event_hip.cc index ebec9c593e6eee..6f0db4642ddbba 100644 --- a/caffe2/core/hip/event_hip.cc +++ b/caffe2/core/hip/event_hip.cc @@ -138,7 +138,7 @@ void EventWaitHIPHIP(const Event* event, void* context) { // CAFFE_ENFORCE_EQ( // CaffeCudaGetDevice(), - // static_cast(context)->device_id()); + // static_cast(context)->cuda_gpu_id()); HIP_CHECK(hipStreamWaitEvent(context_stream, wrapper->hip_event_, 0)); } } diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index 87633fadebe34e..d9816e787ba88c 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -176,7 +176,7 @@ class ComputeBlobRecyclingForDag { // cuda device option but whose inputs/outputs are on CPU if (net.op(op_index).type() == "CopyGPUToCPU") { blob_device_[output].set_device_type(0); - blob_device_[output].set_device_id(0); + blob_device_[output].set_cuda_gpu_id(0); } } } @@ -478,7 +478,7 @@ class ComputeBlobRecyclingForDag { const DeviceOption& device_option) { const DeviceOption& blob_device = blob_device_[blob_name]; if (device_option.device_type() != blob_device.device_type() || - device_option.device_id() != blob_device.device_id()) { + device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) { return false; } for (const int token : req_tokens_[blob_name]) { diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index acc30e565170de..fe4b57cd3326d4 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -157,7 +157,7 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { numa_node_id); return poolGetter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { - auto gpu_id = device_option.device_id(); + auto gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); @@ -173,7 +173,7 @@ int AsyncNetBase::stream(int task_id) { const auto& device_option = event(task_id).GetDeviceOption(); int stream_id = 0; if (device_option.device_type() == PROTO_CUDA) { - int gpu_id = device_option.device_id(); + int gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= getStreamCounters().size()) { getStreamCounters().resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 86d0b4d1d271dc..550a760826edd8 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -112,7 +112,7 @@ AsyncDAGNet::AsyncDAGNet( int AsyncDAGNet::stream(const DeviceOption& device_option) { int stream_id = 0; if (device_option.device_type() == 
PROTO_CUDA) { - int gpu_id = device_option.device_id(); + int gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); diff --git a/caffe2/core/net_gpu_test.cc b/caffe2/core/net_gpu_test.cc index fab56112ec227c..eaea9377f9bcac 100644 --- a/caffe2/core/net_gpu_test.cc +++ b/caffe2/core/net_gpu_test.cc @@ -124,7 +124,7 @@ TEST(NetTest, DISABLED_ChainingForDifferentDevices) { type: "NetTestDummy" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } )DOC"; diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 8115ae3aab6a3c..79be08c03b2325 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -649,7 +649,7 @@ std::map> ValidateTensorDevices( &blob_device); if (blob_device.device_type() == PROTO_CUDA && - blob_device.device_id() != op_device.device_id()) { + blob_device.cuda_gpu_id() != op_device.cuda_gpu_id()) { mismatches[blob_name] = std::make_pair(op_device, blob_device); } else if ( blob_device.device_type() == PROTO_HIP && diff --git a/caffe2/mkl/utils/mkl_memory.cc b/caffe2/mkl/utils/mkl_memory.cc index 9d4f347a13cb81..3f05f9c5d24bde 100644 --- a/caffe2/mkl/utils/mkl_memory.cc +++ b/caffe2/mkl/utils/mkl_memory.cc @@ -26,7 +26,7 @@ static vector GetMKLTensorInfo( const mkl::MKLMemory* tc = static_cast*>(c); *capacity = tc->size() * sizeof(T); device->set_device_type(PROTO_MKLDNN); - device->set_device_id(0); + device->set_cuda_gpu_id(0); return tc->dims(); } diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index 5bd9b0a11b0921..bf4e20b7904711 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -70,7 +70,7 @@ void ProfileOperatorObserver::Start() { int device; cudaGetDevice(&device); - cudaSetDevice(context->device_id()); + cudaSetDevice(context->cuda_gpu_id()); cudaEventCreate(&start_); cudaEventRecord(start_, context->cuda_stream()); @@ -92,7 +92,7 @@ void ProfileOperatorObserver::Stop() { int device; cudaGetDevice(&device); - cudaSetDevice(context->device_id()); + cudaSetDevice(context->cuda_gpu_id()); cudaEventCreate(&stop_); cudaEventRecord(stop_, context->cuda_stream()); cudaEventSynchronize(stop_); diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 8a21fa0acf679c..2350910febff27 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -65,7 +65,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) { {DeviceType::CUDA, caffe2::DeviceType::CUDA}}; caffe2::DeviceOption d; d.set_device_type(static_cast(m.at(onnx_device.type))); - d.set_device_id(onnx_device.device_id); + d.set_cuda_gpu_id(onnx_device.device_id); return d; } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index f81b7789699c15..eaa90b3dcdbc13 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -9,7 +9,7 @@ void LoadOp::SetCurrentDevice(BlobProto* proto) { proto->mutable_tensor()->clear_device_detail(); auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); - device_detail->set_device_id(CaffeCudaGetDevice()); + device_detail->set_cuda_gpu_id(CaffeCudaGetDevice()); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc index 061f54d3a4cb0e..e16e2073f7fd12 100644 --- 
a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc +++ b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc @@ -72,11 +72,11 @@ void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) { if (gpu_id == -1 && rnn_op.op->device_option().device_type() == DeviceTypeProto::PROTO_CUDA) { - gpu_id = rnn_op.op->device_option().device_id(); + gpu_id = rnn_op.op->device_option().cuda_gpu_id(); } else { CAFFE_ENFORCE( rnn_op.op->device_option().device_type() == 0 || - rnn_op.op->device_option().device_id() == gpu_id, + rnn_op.op->device_option().cuda_gpu_id() == gpu_id, "RNN Executor only supports ops on one GPU"); } diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 63a2a256ded1ea..9dc745edbdf0d9 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -183,7 +183,7 @@ message DeviceOption { // optional DeviceType device_type = 1 [ default = CPU ]; optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU. // [CUDA specific] the cuda gpu id. - optional int32 device_id = 2; + optional int32 cuda_gpu_id = 2; // [general] The random seed to start the device random number generator with. optional uint32 random_seed = 3; // [general] What node this op should execute on. diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h index e0eb8e8dcdcdf3..ded59d52b21f47 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -86,12 +86,54 @@ inline CAFFE2_API caffe2::DeviceOption DeviceToOption( caffe2::DeviceOption option; auto type = device.type(); option.set_device_type(TypeToProto(type)); - option.set_device_id(device.index()); + + switch (type) { + case DeviceType::CPU: + if (device.index() != -1) { + option.set_numa_node_id(device.index()); + } + break; + case DeviceType::CUDA: + option.set_cuda_gpu_id(device.index()); + break; + case DeviceType::HIP: + option.set_hip_gpu_id(device.index()); + break; + case DeviceType::OPENGL: + case DeviceType::OPENCL: + case DeviceType::MKLDNN: + case DeviceType::IDEEP: + case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES: + case DeviceType::ONLY_FOR_TEST: + break; + default: + AT_ERROR( + "Unknown device:", + static_cast(type), + ". 
If you have recently updated the caffe2.proto file to add a new " + "device type, did you forget to update the ProtoToType() and TypeToProto" + "function to reflect such recent changes?"); + } return option; } inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) { - return at::Device(ProtoToType(option.device_type()), option.device_id()); + auto type = option.device_type(); + int32_t id = -1; + switch (type) { + case caffe2::PROTO_CPU: + if (option.has_numa_node_id()) { + id = option.numa_node_id(); + } + break; + case caffe2::PROTO_CUDA: + id = option.cuda_gpu_id(); + break; + case caffe2::PROTO_HIP: + id = option.hip_gpu_id(); + break; + } + return at::Device(ProtoToType(type), id); } } // namespace caffe2 diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index f9ccf92d75099b..f927020e6ae88f 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -236,5 +236,5 @@ def CPU(self): def GPU(self, gpu_id=0): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = gpu_id + device_option.cuda_gpu_id = gpu_id return device_option diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 4f683daa368240..6850c02fc13964 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -84,7 +84,7 @@ def IsOperatorWithEngine(op_type, engine): def DeviceOption( device_type, - device_id=0, + cuda_gpu_id=0, random_seed=None, node_name=None, numa_node_id=None, @@ -92,7 +92,7 @@ def DeviceOption( ): option = caffe2_pb2.DeviceOption() option.device_type = device_type - option.device_id = device_id + option.cuda_gpu_id = cuda_gpu_id if node_name is not None: option.node_name = node_name if random_seed is not None: @@ -115,7 +115,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr if not opt1.device_type or not opt2.device_type: # At least one option is for CPU, check if both are for CPU. return not opt1.device_type and not opt2.device_type - return opt1.device_id == opt2.device_id + return opt1.cuda_gpu_id == opt2.cuda_gpu_id def InferBlobDevices(net): @@ -2111,7 +2111,7 @@ def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): """A convenient function to run everything on the GPU.""" device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = gpu_id + device_option.cuda_gpu_id = gpu_id self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: @@ -2286,7 +2286,7 @@ def copy_func_between_devices(src, dst): return None if src.device_type == CUDA and dst.device_type == CUDA: - if src.device_id == dst.device_id: + if src.cuda_gpu_id == dst.cuda_gpu_id: return None else: def fun(net, *args, **kw): @@ -2312,10 +2312,10 @@ def fun(net, *args, **kw): def device_equal(src, dst): ''' We are using this function instead of the == operator because optional-value - comparison between empty device_options and {device_type:0, device_id:0} + comparison between empty device_options and {device_type:0, cuda_gpu_id:0} returns not equal in some cases.
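For example, caffe2_pb2.DeviceOption() and caffe2_pb2.DeviceOption(device_type=0, cuda_gpu_id=0) compare unequal under protobuf message equality, even though both describe CPU device 0.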
''' - return src.device_type == dst.device_type and src.device_id == dst.device_id + return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id def update_placeholder_op_output(op, blob_to_device): @@ -2429,7 +2429,7 @@ def _gen_new_name(blob, device_option): if device_option.device_type == CPU: suffix = '_cpu' elif device_option.device_type == CUDA: - suffix = '_cuda_' + str(device_option.device_id) + suffix = '_cuda_' + str(device_option.cuda_gpu_id) else: raise RuntimeError( "Unknown device type: {}". diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 2f6dedbfd80c83..7120843f33152d 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -83,17 +83,17 @@ def testDeviceScope(self): # explicitly setting a device device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 op = core.CreateOperator("Relu", "x", "y", device_option=device_option) self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) with core.DeviceScope(device_option): # from device scope op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) # from an overridden device option override_device = caffe2_pb2.DeviceOption() override_device.device_type = caffe2_pb2.CPU @@ -109,13 +109,13 @@ def testDeviceScope(self): def testNameAndDeviceScopeTogether(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): with core.NameScope("foo"): op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(len(op.input), 1) self.assertEqual(op.input[0], "foo/x") self.assertEqual(len(op.output), 1) @@ -255,7 +255,7 @@ class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 op = core.CreateOperator( "Ludicrous", "x", "y", name="ludicrous", control_input="z", device_option=device_option, @@ -271,7 +271,7 @@ def testCreate(self): self.assertEqual(op.control_input[0], "z") self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertTrue(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args @@ -574,7 +574,7 @@ def test_check_equal_default_value(self): opt2 = caffe2_pb2.DeviceOption() opt1.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.device_id = 5 + opt1.cuda_gpu_id = 5 # opt1 still is on CPU, so the options should be equal self.assertTrue(core.device_option_equal(opt1, opt2)) opt2.device_type = 0 @@ -649,7 +649,7 @@ class TestInferDevice(test_util.TestCase): def setUp(self): device_option = caffe2_pb2.DeviceOption() 
device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 self.cuda_option = device_option self.cpu_option = caffe2_pb2.DeviceOption() @@ -748,7 +748,7 @@ def test_inject_copy(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) @@ -765,7 +765,7 @@ def test_inject_copy(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") self.assertNotEqual(blob_to_device["fc_w"], device_option) @@ -775,7 +775,7 @@ def test_cross_nets(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) const = init_net.ConstantFill([], 'const', shape=[], value=1.) @@ -791,12 +791,12 @@ def test_cross_nets(self): op = nets[1]._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "fc_w_cuda_1") op = nets[1]._net.op[1] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "fc_b_cuda_1") op = nets[1]._net.op[2] self.assertEqual(op.type, "FC") @@ -804,7 +804,7 @@ def test_cross_nets(self): self.assertEqual(op.input[1], "fc_w_cuda_1") self.assertEqual(op.input[2], "fc_b_cuda_1") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) op = nets[1]._net.op[3] self.assertEqual(op.type, "Add") self.assertEqual(op.input[0], "fc1") @@ -822,7 +822,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -832,7 +832,7 @@ def test_cross_nets(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -844,7 +844,7 @@ def test_cross_nets(self): type: "FC" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -855,7 +855,7 @@ def test_cross_nets(self): type: "Add" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } external_input: "data" @@ -870,7 +870,7 @@ def test_cross_nets_no_change(self): init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) @@ -887,7 +887,7 @@ def test_cross_nets_no_change(self): self.assertEqual(op.input[1], "fc_w") self.assertEqual(op.input[2], "fc_b") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + 
self.assertEqual(op.device_option.cuda_gpu_id, 1) """ For reference, net.Proto() should be like: name: "" @@ -900,7 +900,7 @@ def test_cross_nets_no_change(self): type: "FC" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } external_input: "data" @@ -912,7 +912,7 @@ def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu1") @@ -920,10 +920,10 @@ def test_inject_copy_multi_use(self): with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") - device_option.device_id = 0 + device_option.cuda_gpu_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") @@ -931,12 +931,12 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") @@ -945,7 +945,7 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[3] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] @@ -955,18 +955,18 @@ def test_inject_copy_multi_use(self): op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """ @@ -979,7 +979,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -989,7 +989,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -1005,7 +1005,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } op { @@ -1021,7 +1021,7 @@ def test_inject_copy_multi_use(self): type: "CopyCPUToGPU" device_option { device_type: 1 - device_id: 0 + cuda_gpu_id: 0 } } op { @@ -1031,7 +1031,7 @@ def test_inject_copy_multi_use(self): type: "Relu" 
device_option { device_type: 1 - device_id: 0 + cuda_gpu_id: 0 } } op { @@ -1041,7 +1041,7 @@ def test_inject_copy_multi_use(self): type: "Relu" device_option { device_type: 1 - device_id: 1 + cuda_gpu_id: 1 } } external_input: "data" @@ -1060,7 +1060,7 @@ def test_inject_copy_placeholder_ops(self): cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) gpu_device[i].device_type = caffe2_pb2.CUDA - gpu_device[i].device_id = 0 + gpu_device[i].cuda_gpu_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 'node:1' @@ -1100,12 +1100,12 @@ def test_inject_copy_placeholder_ops(self): op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.device_id, 0) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) @@ -1128,7 +1128,7 @@ def test_blob_inplace(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = 1 + device_option.cuda_gpu_id = 1 net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) with core.DeviceScope(device_option): diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 749c8b12c930e8..89770dc6ea7d9a 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -813,7 +813,7 @@ def builder_fun(model): device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" - namescope = "{}_{}/".format(device_prefix, device.device_id) + namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -1540,7 +1540,7 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.device_id + op_gpu = op_dev.cuda_gpu_id # This avoids failing on operators that are only for CPU if op_dev.device_type != caffe2_pb2.CUDA: @@ -1904,7 +1904,7 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.device_id].append(op) + ops[op.device_option.cuda_gpu_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 8470df1588717f..5cc18f99bd9eb9 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -259,7 +259,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # Include device option for each GPU expanded_device_options = [cpu_do] + ( - [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, device_id=i) + [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) for i in range(workspace.NumCudaDevices())] if workspace.has_gpu_support else []) diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index 1e881d27f49dc8..f8e3f32bb2c225 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -596,7 +596,7 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not 
None: step_op.device_option.device_type = device.device_type - step_op.device_option.device_id = device.device_id + step_op.device_option.cuda_gpu_id = device.cuda_gpu_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -610,7 +610,7 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.device_id = device.device_id + op.device_option.cuda_gpu_id = device.cuda_gpu_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py index 2f2b5aced6640e..b407f96d2391f8 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -26,7 +26,7 @@ def OnGPU(gpu_id): """ device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA - device_option.device_id = gpu_id + device_option.cuda_gpu_id = gpu_id return device_option diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 7583f863b1f5ad..4b5cddb61d244e 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -268,11 +268,11 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): if not dev_opt or not ( - dev_opt.device_type or dev_opt.device_id or dev_opt.node_name): + dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.device_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 3178345cf46e21..8d3a362dcdf725 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -27,7 +27,7 @@ def build_test_net(net_name): gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = caffe2_pb2.CUDA - gpu_device_option.device_id = 0 + gpu_device_option.cuda_gpu_id = 0 net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu", device_option=gpu_device_option) diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 5802e49de526dc..8cc3f9e2fa98eb 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -24,7 +24,7 @@ def __init__(self, init_net, predict_net, workspace, uninitialized): @property def _name_scope(self): if self.predict_net.device_option.device_type == caffe2_pb2.CUDA: - return 'gpu_{}'.format(self.predict_net.device_option.device_id) + return 'gpu_{}'.format(self.predict_net.device_option.cuda_gpu_id) return '' def run(self, inputs, **kwargs): diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index 8e3817034d435b..2d53027a0a053d 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -91,7 +91,7 @@ def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll): self.assertEqual(proto.tensor.device_detail.device_type, device_type) if device_type == caffe2_pb2.CUDA: - self.assertEqual(proto.tensor.device_detail.device_id, + self.assertEqual(proto.tensor.device_detail.cuda_gpu_id, gpu_id) blobs = [str(i) for i in range(len(arrays))] diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 66ac07dbdca079..9d9bb38e178517 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1216,7 +1216,7 @@ def 
test_lstm_extract_predictor_net(self): if arg.name == "step_net": for step_op in arg.n.op: self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.device_id) + self.assertEqual(1, step_op.device_option.cuda_gpu_id) elif arg.name == 'backward_step_net': self.assertEqual(caffe2_pb2.NetDef(), arg.n) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index ddd5871f7d4b74..0c5b18b0b6ab11 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -83,7 +83,7 @@ def make_unique_blob_name(self, base_str): if current_scope.device_type == caffe2_pb2.CUDA: return self.get_gpu_blob_name( - base_str, current_scope.device_id, current_scope.node_name + base_str, current_scope.cuda_gpu_id, current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -279,7 +279,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), shape=[1], value=1.0 ) @@ -488,12 +488,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.device_id), + "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.device_id), + [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), shape=[1], value=self.weight_decay ) @@ -1160,7 +1160,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.device_id), + "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), shape=[1], value=1.0 ) diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index ef11246bdfcc9b..b4c71535debe66 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -193,7 +193,7 @@ def test_load_device_scope(self): # check device options for op in list(init_net.Proto().op) + list(predict_init_net.Proto().op): - self.assertEqual(1, op.device_option.device_id) + self.assertEqual(1, op.device_option.cuda_gpu_id) self.assertEqual(caffe2_pb2.CPU, op.device_option.device_type) def test_db_fails_without_params(self): diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 6db4ae42b84742..679152c788132e 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -34,7 +34,7 @@ class DLPackWrapper { "Unsupported device type: ", device_option.device_type()); tensor_context.device_type = *device_type_ptr; - tensor_context.device_id = device_option.device_id(); + tensor_context.device_id = device_option.cuda_gpu_id(); if (tensor->size() <= 0) { tensor->Resize(0); @@ -87,7 +87,7 @@ class DLPackWrapper { int dlpack_device_id = dlTensor->ctx.device_id; CAFFE_ENFORCE_EQ( dlpack_device_id, - device_option.device_id(), + device_option.cuda_gpu_id(), "Expected same device id for DLPack and C2 tensors"); std::vector dims; diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index dd80282238a80b..dc8e088eba97c5 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -30,7 +30,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { case PROTO_CPU: return option.numa_node_id(); case 
PROTO_CUDA: - return option.device_id(); + return option.cuda_gpu_id(); case PROTO_MKLDNN: return option.numa_node_id(); case PROTO_HIP: @@ -43,7 +43,7 @@ C10_EXPORT int DeviceId(const DeviceOption& option) { C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { return ( lhs.device_type() == rhs.device_type() && - lhs.device_id() == rhs.device_id() && + lhs.cuda_gpu_id() == rhs.cuda_gpu_id() && lhs.hip_gpu_id() == rhs.hip_gpu_id() && lhs.node_name() == rhs.node_name() && lhs.numa_node_id() == rhs.numa_node_id()); diff --git a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc index 5d8fb86b34e3bb..c9f37f4c98c290 100644 --- a/caffe2/utils/proto_utils_test.cc +++ b/caffe2/utils/proto_utils_test.cc @@ -11,9 +11,9 @@ TEST(ProtoUtilsTest, IsSameDevice) { EXPECT_FALSE(IsSameDevice(a, b)); b.set_node_name("my_node"); EXPECT_TRUE(IsSameDevice(a, b)); - b.set_device_id(2); + b.set_cuda_gpu_id(2); EXPECT_FALSE(IsSameDevice(a, b)); - a.set_device_id(2); + a.set_cuda_gpu_id(2); EXPECT_TRUE(IsSameDevice(a, b)); a.set_device_type(DeviceTypeProto::PROTO_CUDA); b.set_device_type(DeviceTypeProto::PROTO_CPU); diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index 3a98a4cb7d9f3e..113403fd87bbf4 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -2216,7 +2216,7 @@ "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2), "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2), "curand_generator" : ("hiprand_generator", API_CAFFE2), - "device_id" : ("hip_gpu_id", API_CAFFE2), + "cuda_gpu_id" : ("hip_gpu_id", API_CAFFE2), "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2), } From 1d3f650ce4f1b781f03e8b4f250a25d5a8f819cc Mon Sep 17 00:00:00 2001 From: Dmytro Dzhulgakov Date: Tue, 2 Oct 2018 00:31:42 -0700 Subject: [PATCH 82/82] Revert D10098106: [pytorch][PR] [WIP] New version of PT1 model format Differential Revision: D10098106 Original commit changeset: 94ec7fc57c84 fbshipit-source-id: 38f729b0970618f38359797b806cbbcd865f4715 --- caffe2/core/blob_serialization.cc | 15 +- caffe2/proto/caffe2.proto | 86 +---- caffe2/proto/torch.proto | 564 +++++++++++++++++++++++++++--- caffe2/python/convert.py | 56 +++ caffe2/python/convert_test.py | 234 +++++++++++++ caffe2/python/pybind_state.cc | 43 ++- caffe2/python/workspace.py | 4 +- caffe2/utils/proto_convert.cc | 181 ++++++++++ caffe2/utils/proto_convert.h | 14 + 9 files changed, 1058 insertions(+), 139 deletions(-) diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index f27d16adf342f8..8126b3d59425a1 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -308,12 +308,6 @@ void TensorSerializer::Serialize( const_cast(raw_data + i * input.itemsize()), input.meta()); proto.add_string_data(SerializeBlob(temp_blob, "")); } - } break; - case TensorProto_DataType_SPECIAL: { - CAFFE_THROW("SPECIAL Tensor is not handled yet."); - } break; - case TensorProto_DataType_NO_CONTENT: { - CAFFE_THROW("NO_CONTENT Tensor should not be serialized."); } break; // Note: we intentially do not provide "default:" so if any new data types // are added, the compiler should warn the user to add the case here. 
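The hunk above and the one below drop the SPECIAL and NO_CONTENT branches back out of the serializer and deserializer switches, restoring the plain per-data-type dispatch. For context, a minimal sketch of the round-trip those switches implement, using the caffe2 Python workspace bindings (the blob names are illustrative):

from caffe2.python import workspace
import numpy as np

# Feed a float tensor; the serializer dispatches on its data type and
# packs the values into the float_data field of the emitted TensorProto.
workspace.FeedBlob("x", np.ones((2, 3), dtype=np.float32))
serialized = workspace.SerializeBlob("x")

# The deserializer rebuilds an identical tensor from the serialized BlobProto.
workspace.DeserializeBlob("x_roundtrip", serialized)
assert np.array_equal(workspace.FetchBlob("x"), workspace.FetchBlob("x_roundtrip"))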
@@ -526,14 +520,7 @@ void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { (i + chunkBegin) * temp_blob.meta().itemsize(), 1); } - } break; - case TensorProto_DataType_SPECIAL: { - CAFFE_THROW("SPECIAL Tensor is not handled yet."); - } break; - case TensorProto_DataType_NO_CONTENT: { - CAFFE_THROW("NO_CONTENT Tensor should not be deserialized."); - } break; - // Note: we intentially do not provide "default:" so if any new data types + } } context->FinishDeviceComputation(); } diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 9dc745edbdf0d9..71870010293492 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -15,46 +15,23 @@ package caffe2; message TensorProto { // The dimensions in the tensor. repeated int64 dims = 1; - // The strides of the tensor. - repeated int64 strides = 12; - - // Data type enum DataType { UNDEFINED = 0; - - // Basic types - FLOAT = 1; // float - INT32 = 2; // int - BYTE = 3; // byte, when deserialized, is going to be restored as uint8 - STRING = 4; // string - - // Less-commonly used data types - BOOL = 5; // bool - UINT8 = 6; // uint8_t - INT8 = 7; // int8_t - UINT16 = 8; // uint16_t - INT16 = 9; // int16_t - INT64 = 10; // int64_t + FLOAT = 1; // float + INT32 = 2; // int + BYTE = 3; // BYTE, when deserialized, is going to be restored as uint8. + STRING = 4; // string + // Less-commonly used data types. + BOOL = 5; // bool + UINT8 = 6; // uint8_t + INT8 = 7; // int8_t + UINT16 = 8; // uint16_t + INT16 = 9; // int16_t + INT64 = 10; // int64_t FLOAT16 = 12; // at::Half - DOUBLE = 13; // double - - // Special data type, type information is stored in the special type field - SPECIAL = 51; - // Use TensorProto to specify the shape and type - NO_CONTENT = 52; + DOUBLE = 13; // double } optional DataType data_type = 2 [default = FLOAT]; - // if data_type is SPECIAL, use this field to express the type info - optional SpecialType special_type = 13; - - // Data storage - enum StorageType { - TYPED = 1; - RAW = 2; - EXTERNAL = 3; - ALIAS = 4; - } - optional StorageType storage_type = 14 [default = TYPED]; // For float repeated float float_data = 3 [packed = true]; // For int32, uint8, int8, uint16, int16, bool, and float16 @@ -69,13 +46,6 @@ message TensorProto { repeated double double_data = 9 [packed = true]; // For int64 repeated int64 int64_data = 10 [packed = true]; - // For raw data - optional bytes raw_data = 15; - // External data by file name - optional string external_data = 16; - // For argument, to share the content - optional string alias = 17; - // Optionally, a name for the tensor. optional string name = 7; @@ -83,23 +53,13 @@ message TensorProto { // it was serialized from. This is useful in cases like snapshotting a whole // workspace in a multi-GPU environment. optional DeviceOption device_detail = 8; - // When loading from chunks this is going to indicate where to put data in the // full array. When not used full data have to be present message Segment { required int64 begin = 1; required int64 end = 2; - optional int64 chunk_num = 51; - optional int64 chunk_id = 52; } optional Segment segment = 11; - optional string debug_info = 18; - - // For PyTorch serialized tensor. 
- optional bool require_gradient = 19; - optional bool is_buffer = 20; - - repeated Argument annotations = 21; } message QTensorProto { @@ -126,11 +86,7 @@ message TensorShape { repeated int32 unknown_dims = 3; optional bool unknown_shape = 4 [default = false]; optional string name = 5; -} -// This is prepared for non-tensor types. -message SpecialType { - optional string name = 1; } message TensorShapes { @@ -141,17 +97,13 @@ message TensorShapes { // values, or repeated float, int and string arrays. message Argument { optional string name = 1; - optional float f = 2; optional int64 i = 3; optional bytes s = 4; - optional TensorProto t = 10; optional NetDef n = 8; - repeated float floats = 5; repeated int64 ints = 6; repeated bytes strings = 7; - repeated TensorProto tensors = 11; repeated NetDef nets = 9; } @@ -200,11 +152,7 @@ message DeviceOption { // Operator Definition. message OperatorDef { repeated string input = 1; // the name of the input blobs - // the input name in the schema, for named inputs - repeated string mapped_inputs = 11; repeated string output = 2; // the name of output top blobs - // the outputname in the schema, for named outputs - repeated string mapped_outputs = 12; optional string name = 3; // the operator name. This is optional. // the operator type. This is needed to create the object from the operator // registry. @@ -238,16 +186,6 @@ message OperatorDef { // This is an optional string with no assumed characteristics as // operators can be constructed in any language. optional string debug_info = 10; - - // additional annotations - repeated Argument annotations = 13; - - // for jit ir exporting - optional string aten_function = 14; - - // for operator versioning - optional string domain = 15; - optional string op_version = 16; } // Network definition. diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto index f31c3b65ec1894..43dfd02b14c8cc 100644 --- a/caffe2/proto/torch.proto +++ b/caffe2/proto/torch.proto @@ -4,77 +4,547 @@ import "caffe2/proto/caffe2.proto"; package torch; -enum ProtoVersion { +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of +// protobuf that is compatible with both protobuf v2 and v3. This means that we +// do not use any protobuf features that are only available in one of the two +// versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). 
We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. _START_VERSION = 0; - IR_VERSION_NEWEST = 0x0000000000000101; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_NEWEST_ONNX = 0x0000000000000003; + + // PYTORCH IR VERSION + IR_VERSION_NEWEST = 0x0000000000000103; } -message MethodDef { - // method name - optional string name = 1; // method name +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute - // static graph - optional caffe2.NetDef graph = 2; - // method is represented as torch script - optional string torch_script = 3; + // if ref_attr_name is not empty, ref_attr_name is the attribute name in the parent function. + // In this case, this AttributeProto does not contain data, and it's a reference to an attribute + // in the parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; - // the names of inputs and outputs - repeated string inputs = 4; - repeated string outputs = 5; + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; - // whether this method is main or not. - // by default, `forward` should be the main method. - optional bool is_main = 6; + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field heuristics to determine + // which value field was in use. For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accommodate proto3 implementations.
+ optional AttributeType type = 20; // discriminator that indicates which field below is in use - optional string debug_info = 7; + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph - repeated caffe2.Argument annotations = 8; + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph } +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value -message ModuleDef { - repeated ModuleDef submodules = 1; + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node - // We suppose to store the modules in one of the following format: - // - methods (static graph or torch script) - // - pickle - // - cpp_arena - repeated MethodDef methods = 2; - // because the old pickle modules may not be supported by torch_script, - // have to stored as pickle_arena at this moment. - optional bytes pickle_arena = 3; - // should be exposed by the Class Archive, so user can save - // module specific data which cannot be store in the graph or torch_script - optional bytes cpp_arena = 4; + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain - // the names of inputs and outputs of the module are inferred - // from the main method. + // Additional named attributes. + repeated AttributeProto attribute = 5; - optional string debug_info = 5; + // A human-readable documentation for this node. Markdown is allowed. 
+ // Equivalent to string debug_info + optional string doc_string = 6; - repeated caffe2.Argument annotations = 6; + // Additional annotations, attributes are defined in Schema + // To be added as annotations: + // string engine + // string list control_input + // int64 is_gradient_op + repeated AttributeProto annotations = 8; + + // Besides the node type, PyTorch also serializes the ATen function signature + optional caffe2.DeviceOption device_option = 51; + optional string aten_function = 52; } -message ModelDef { +// Models +// +// ModelProto is a top-level file/container format for bundling an ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +// +// Model ==> Caffe2 MetaNetDef +// ==> PyTorch Module +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. optional int64 ir_version = 1; - // main module of the model - optional ModuleDef main_module = 2; + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum above. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; - repeated caffe2.TensorProto parameters = 3; - repeated caffe2.TensorProto value_infos = 4; + // The parameterized graph that is evaluated to execute the model. + // The main graph; in the single-graph case, it is ONNX compatible. + optional GraphProto graph = 7; - // to distinguish whether exported from c2 or torch - optional string producer_name = 5; + // The remaining nets in MetaNetDef. + // Submodules and methods in PyTorch. + repeated GraphProto methods = 15; + + // Named metadata values; keys should be distinct. + // Much of the metadata in MetaNetDef and the predictor is piggybacked here. + // 1) project + // 2) model_class + // 3) internal_version + // 4) predictor_type + // 5) predictor_id + // 6) execute_plan + // 7) applicationSpecificInfo (another string map, need to verify it has no duplicate.) + // 8) engine + // 9) publish time + repeated StringStringEntryProto metadata_props = 14; + + // Model name + optional string name = 16; + + // Model annotations + repeated AttributeProto annotations = 17; + + // Mapping from list name to blob name list, must be string list type. + // Equivalent to blobs in MetaNetDef.
+ repeated AttributeProto blob_lists = 51; + + // Mapping from plan name to serialized plan, must be string list type. + // Equivalent to plans in MetaNetDef. + repeated AttributeProto plans = 52; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. +// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +// Graph ==> NetDef in Caffe2 +// ==> Submodule/Method in PyTorch +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // also appears in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // Additional annotations. + repeated AttributeProto annotations = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} - // put build version here - optional string producer_version = 6; +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool - optional string name = 7; + // Advanced types + FLOAT16 = 10; + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + // Future extensions go here. - optional string debug_info = 8; + // Special data type, real type information is stored in ValueInfoProto. + // If data_type is SPECIAL, raw_data should be used. + SPECIAL = 51; + } + + // The shape of the tensor. + repeated int64 dims = 1; + repeated int64 strides = 14; + + // The data type of the tensor. + optional DataType data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + optional int64 chuck_num = 51; + optional int64 chuck_id = 52; + } + // Used as offset in the external shared data. + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. 
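+ // (For example, a tensor with dims [2, 3] stores its six elements in the + // order [a00, a01, a02, a10, a11, a12].)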
+ // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary components appearing in the + // subsequent even numbered positions. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0, 3.0, 4.0]) + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and Half values + // float16 values must be bit-wise converted to a uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store the tensor value, elements MUST + // be stored as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of a specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // For double + // Complex128 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary components appearing in the + // subsequent even numbered positions. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0, 3.0, 4.0]) + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; + + // External data by file name + optional string external_data = 13; + + // If two tensors represent the same weights/content, use alias. + // A TensorProto with that name must exist in the initializer list.
+ // To avoid the duplicate tensor in attribute, such as value in Constant node. + // This is useful, if everything is stored just in the proto. + optional string alias = 16; + + // Additional annotations. + repeated AttributeProto annotations = 17; + + // Device info + optional caffe2.DeviceOption device_option = 51; + + // For PyTorch serialized tensor. + optional int64 require_gradient = 52; + optional int64 is_buffer = 53; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + // To represent a scalar, using no dim to represent 0-d tensor. + repeated Dimension dim = 1; + + repeated Dimension stride = 51; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST be present for this version of the IR. + optional TensorProto.DataType elem_type = 1; + optional TensorShapeProto shape = 2; + } + + // Sequence type: List, Tuple + message Sequence { + // elem_type and elem_type_list cannot appear together. + // If all the element types are the same, we use elem_type, + // otherwise, we specify the type of each element in elem_type_list. + optional TypeProto elem_type = 1; + repeated TypeProto elem_type_list = 51; + enum SequenceType { + UNDEFINED = 0; + LIST = 1; + TUPLE = 2; + } + optional SequenceType sequence_type = 52; + } + + // Map, (not necessary at this moment) + message Map { + optional TensorProto.DataType key_type = 1; + optional TypeProto value_type = 2; + } + + // Special type of blobs, based on the type_name, we can choose the right + // serializer and deserialzier. + message SpecialBlob { + optional string type_name = 1; + } + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + Sequence sequence_type = 4; + Map map_type = 5; + SpecialBlob special_type = 51; + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} - // annotations, it is used for MetaNetDef's metadata - repeated caffe2.Argument annotations = 9; +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. 
+  optional int64 version = 2;
+}
diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py
index 44f81d6e2d135e..50eaf220c7f721 100644
--- a/caffe2/python/convert.py
+++ b/caffe2/python/convert.py
@@ -8,3 +8,59 @@
 from caffe2.proto import caffe2_pb2, torch_pb2
 import caffe2.python._import_c_extension as C
+
+
+def ArgumentToAttributeProto(arg):
+    serialized_arg = None
+    if hasattr(arg, 'SerializeToString') and callable(arg.SerializeToString):
+        serialized_arg = arg.SerializeToString()
+    elif isinstance(arg, bytes):
+        serialized_arg = arg
+    else:
+        raise ValueError('arg must either provide a SerializeToString '
+                         'method or be bytes; got type {}'.format(type(arg)))
+    attr = torch_pb2.AttributeProto()
+    attr.ParseFromString(C.argument_to_attribute_proto(serialized_arg))
+    return attr
+
+
+def AttributeProtoToArgument(attr):
+    serialized_attr = None
+    if hasattr(attr, 'SerializeToString') and callable(attr.SerializeToString):
+        serialized_attr = attr.SerializeToString()
+    elif isinstance(attr, bytes):
+        serialized_attr = attr
+    else:
+        raise ValueError('attr must either provide a SerializeToString '
+                         'method or be bytes; got type {}'.format(type(attr)))
+    arg = caffe2_pb2.Argument()
+    arg.ParseFromString(C.attribute_proto_to_argument(serialized_attr))
+    return arg
+
+
+def OperatorDefToNodeProto(op_def):
+    serialized_op_def = None
+    if hasattr(op_def, 'SerializeToString') and callable(op_def.SerializeToString):
+        serialized_op_def = op_def.SerializeToString()
+    elif isinstance(op_def, bytes):
+        serialized_op_def = op_def
+    else:
+        raise ValueError('op_def must either provide a SerializeToString '
+                         'method or be bytes; got type {}'.format(type(op_def)))
+    node = torch_pb2.NodeProto()
+    node.ParseFromString(C.operator_def_to_node_proto(serialized_op_def))
+    return node
+
+
+def NodeProtoToOperatorDef(node_proto):
+    serialized_node_proto = None
+    if hasattr(node_proto, 'SerializeToString') and callable(node_proto.SerializeToString):
+        serialized_node_proto = node_proto.SerializeToString()
+    elif isinstance(node_proto, bytes):
+        serialized_node_proto = node_proto
+    else:
+        raise ValueError('node_proto must either provide a SerializeToString '
+                         'method or be bytes; got type {}'.format(type(node_proto)))
+    op_def = caffe2_pb2.OperatorDef()
+    op_def.ParseFromString(C.node_proto_to_operator_def(serialized_node_proto))
+    return op_def
diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py
index 82c969c901ea61..c8de7e9750680f 100644
--- a/caffe2/python/convert_test.py
+++ b/caffe2/python/convert_test.py
@@ -12,5 +12,239 @@ class TestOperator(unittest.TestCase):
     def setUp(self):
         workspace.ResetWorkspace()
 
+    def testArgument2AttributeProto(self):
+        arg_f = caffe2_pb2.Argument()
+        arg_f.name = "TestArgF"
+        arg_f.f = 10.0
+        attr_f = convert.ArgumentToAttributeProto(arg_f)
+        self.assertEqual(attr_f.name, arg_f.name)
+        self.assertEqual(attr_f.f, arg_f.f)
+
+        arg_i = caffe2_pb2.Argument()
+        arg_i.name = "TestArgI"
+        arg_i.i = 100
+        attr_i = convert.ArgumentToAttributeProto(arg_i)
+        self.assertEqual(attr_i.name, arg_i.name)
+        self.assertEqual(attr_i.i, arg_i.i)
+
+        arg_s = caffe2_pb2.Argument()
+        arg_s.name = "TestArgS"
+        arg_s.s = "TestS".encode("utf-8")
+        attr_s = convert.ArgumentToAttributeProto(arg_s)
+        self.assertEqual(attr_s.name, arg_s.name)
+        self.assertEqual(attr_s.s, arg_s.s)
+
+        # TODO: test net arg
+
+        arg_floats = caffe2_pb2.Argument()
+        arg_floats.name = "TestArgFloats"
+        arg_floats.floats.extend([10.0, 11.0, 12.0])
+        attr_floats = convert.ArgumentToAttributeProto(arg_floats)
+        self.assertEqual(attr_floats.name, arg_floats.name)
+        self.assertEqual(attr_floats.floats, arg_floats.floats)
+
+        arg_ints = caffe2_pb2.Argument()
+        arg_ints.name = "TestArgInts"
+        arg_ints.ints.extend([100, 101, 102])
+        attr_ints = convert.ArgumentToAttributeProto(arg_ints)
+        self.assertEqual(attr_ints.name, arg_ints.name)
+        self.assertEqual(attr_ints.ints, arg_ints.ints)
+
+        arg_strings = caffe2_pb2.Argument()
+        arg_strings.name = "TestArgStrings"
+        arg_strings.strings.extend([
+            "TestStrings1".encode("utf-8"),
+            "TestStrings2".encode("utf-8"),
+        ])
+        attr_strings = convert.ArgumentToAttributeProto(arg_strings)
+        self.assertEqual(attr_strings.name, arg_strings.name)
+        self.assertEqual(attr_strings.strings, arg_strings.strings)
+
+        # TODO: test nets arg
+
+    def testAttributeProto2Argument(self):
+        attr_f = torch_pb2.AttributeProto()
+        attr_f.type = torch_pb2.AttributeProto.FLOAT
+        attr_f.name = "TestAttrF"
+        attr_f.f = 10.0
+        arg_f = convert.AttributeProtoToArgument(attr_f)
+        self.assertEqual(arg_f.name, attr_f.name)
+        self.assertEqual(arg_f.f, attr_f.f)
+
+        attr_i = torch_pb2.AttributeProto()
+        attr_i.type = torch_pb2.AttributeProto.INT
+        attr_i.name = "TestArgI"
+        attr_i.i = 100
+        arg_i = convert.AttributeProtoToArgument(attr_i)
+        self.assertEqual(arg_i.name, attr_i.name)
+        self.assertEqual(arg_i.i, attr_i.i)
+
+        attr_s = torch_pb2.AttributeProto()
+        attr_s.type = torch_pb2.AttributeProto.STRING
+        attr_s.name = "TestArgS"
+        attr_s.s = "TestS".encode("utf-8")
+        arg_s = convert.AttributeProtoToArgument(attr_s)
+        self.assertEqual(arg_s.name, attr_s.name)
+        self.assertEqual(arg_s.s, attr_s.s)
+
+        # TODO: test graph attribute
+
+        attr_floats = torch_pb2.AttributeProto()
+        attr_floats.type = torch_pb2.AttributeProto.FLOATS
+        attr_floats.name = "TestAttrFloats"
+        attr_floats.floats.extend([10.0, 11.0, 12.0])
+        arg_floats = convert.AttributeProtoToArgument(attr_floats)
+        self.assertEqual(arg_floats.name, attr_floats.name)
+        self.assertEqual(arg_floats.floats, attr_floats.floats)
+
+        attr_ints = torch_pb2.AttributeProto()
+        attr_ints.type = torch_pb2.AttributeProto.INTS
+        attr_ints.name = "TestArgInts"
+        attr_ints.ints.extend([100, 101, 102])
+        arg_ints = convert.AttributeProtoToArgument(attr_ints)
+        self.assertEqual(arg_ints.name, attr_ints.name)
+        self.assertEqual(arg_ints.ints, attr_ints.ints)
+
+        attr_strings = torch_pb2.AttributeProto()
+        attr_strings.type = torch_pb2.AttributeProto.STRINGS
+        attr_strings.name = "TestArgStrings"
+        attr_strings.strings.extend([
+            "TestStrings1".encode("utf-8"),
+            "TestStrings2".encode("utf-8"),
+        ])
+        arg_strings = convert.AttributeProtoToArgument(attr_strings)
+        self.assertEqual(arg_strings.name, attr_strings.name)
+        self.assertEqual(arg_strings.strings, attr_strings.strings)
+
+        # TODO: test graphs attribute
+
+    def testOperatorDef2NodeProto(self):
+        op_def = caffe2_pb2.OperatorDef()
+        op_def.input.extend(["A", "B", "C"])
+        op_def.output.extend(["X", "Y"])
+        op_def.name = "TestOpName"
+        op_def.type = "TestOp"
+        arg1 = caffe2_pb2.Argument()
+        arg1.name = "TestArg1"
+        arg1.i = 1
+        arg2 = caffe2_pb2.Argument()
+        arg2.name = "TestArg2"
+        arg2.s = "TestInfo".encode("utf-8")
+        op_def.arg.extend([arg1, arg2])
+        op_def.device_option.CopyFrom(caffe2_pb2.DeviceOption())
+        op_def.engine = "TestEngine".encode("utf-8")
+        op_def.control_input.extend(["input1", "input2"])
+        op_def.is_gradient_op = True
+        op_def.debug_info = "TestDebugInfo"
+
+        node = convert.OperatorDefToNodeProto(op_def)
+
+        self.assertEqual(node.input, op_def.input)
+        self.assertEqual(node.output, op_def.output)
+        self.assertEqual(node.name, op_def.name)
+        self.assertEqual(node.op_type, op_def.type)
+        self.assertEqual(node.attribute[0].name, op_def.arg[0].name)
+        self.assertEqual(node.attribute[1].name, op_def.arg[1].name)
+        self.assertEqual(node.device_option, op_def.device_option)
+        node_engine = [a.s.decode("utf-8") for a in node.annotations if a.name == "engine"][0]
+        self.assertEqual(node_engine, op_def.engine)
+        node_control_input = [a.strings for a in node.annotations if a.name == "control_input"][0]
+        self.assertEqual(len(node_control_input), len(op_def.control_input))
+        for x, y in zip(node_control_input, op_def.control_input):
+            self.assertEqual(x.decode("utf-8"), y)
+        self.assertEqual(node.doc_string, op_def.debug_info)
+        node_is_gradient_op = [a.i for a in node.annotations if a.name == "is_gradient_op"][0]
+        self.assertEqual(node_is_gradient_op, int(op_def.is_gradient_op))
+
+    def testNodeProto2OperatorDef(self):
+        node = torch_pb2.NodeProto()
+        node.input.extend(["A", "B", "C"])
+        node.output.extend(["X", "Y"])
+        node.name = "TestOpName"
+        node.op_type = "TestOp"
+        attr1 = torch_pb2.AttributeProto()
+        attr1.name = "TestAttr1"
+        attr1.type = torch_pb2.AttributeProto.STRING
+        attr1.s = "TestInfo".encode("utf-8")
+        attr2 = torch_pb2.AttributeProto()
+        attr2.name = "TestAttr2"
+        attr2.type = torch_pb2.AttributeProto.INT
+        attr2.i = 10
+        node.attribute.extend([attr1, attr2])
+        node.device_option.CopyFrom(caffe2_pb2.DeviceOption())
+        anno1 = torch_pb2.AttributeProto()
+        anno1.name = "engine"
+        anno1.type = torch_pb2.AttributeProto.STRING
+        anno1.s = "TestEngine".encode("utf-8")
+        anno2 = torch_pb2.AttributeProto()
+        anno2.name = "control_input"
+        anno2.type = torch_pb2.AttributeProto.STRINGS
+        anno2.strings.extend(["input1".encode("utf-8"), "input2".encode("utf-8")])
+        anno3 = torch_pb2.AttributeProto()
+        anno3.name = "is_gradient_op"
+        anno3.type = torch_pb2.AttributeProto.INT
+        anno3.i = 1
+        node.annotations.extend([anno1, anno2, anno3])
+        node.doc_string = "TestDocString".encode("utf-8")
+
+        op_def = convert.NodeProtoToOperatorDef(node)
+
+        self.assertEqual(op_def.input, node.input)
+        self.assertEqual(op_def.output, node.output)
+        self.assertEqual(op_def.name, node.name)
+        self.assertEqual(op_def.type, node.op_type)
+        self.assertEqual(op_def.arg[0].name, node.attribute[0].name)
+        self.assertEqual(op_def.arg[1].name, node.attribute[1].name)
+        self.assertEqual(op_def.device_option, node.device_option)
+        node_engine = [a.s for a in node.annotations if a.name == "engine"][0]
+        self.assertEqual(op_def.engine, node_engine.decode("utf-8"))
+        node_control_input = [a.strings for a in node.annotations if a.name == "control_input"][0]
+        for x, y in zip(op_def.control_input, node_control_input):
+            self.assertEqual(x, y.decode("utf-8"))
+        self.assertEqual(op_def.debug_info, node.doc_string)
+        node_is_gradient_op = [a.i for a in node.annotations if a.name == "is_gradient_op"][0]
+        self.assertEqual(int(op_def.is_gradient_op), node_is_gradient_op)
+
+    def testEnd2End(self):
+        op_def = caffe2_pb2.OperatorDef()
+        op_def.type = "Add"
+        op_def.input.extend(["input1"])
+        op_def.input.extend(["input2"])
+        op_def.output.extend(["output1"])
+        node = convert.OperatorDefToNodeProto(op_def)
+
+        input1 = np.random.randn(1, 3, 1, 5).astype(np.float32)
+        input2 = np.random.randn(2, 1, 4, 1).astype(np.float32)
+        ref_output1 = input1 + input2
+        workspace.FeedBlob("input1", input1)
+        workspace.FeedBlob("input2", input2)
+        self.assertEqual(workspace.RunOperatorOnce(node.SerializeToString(), legacy_proto=False), True)
+
+        self.assertEqual(workspace.HasBlob("output1"), True)
+        fetched_back = workspace.FetchBlob("output1")
+        np.testing.assert_array_equal(fetched_back, ref_output1)
+
+    def testRoundTrip(self):
+        op_def = caffe2_pb2.OperatorDef()
+        op_def.type = "Add"
+        op_def.input.extend(["input1"])
+        op_def.input.extend(["input2"])
+        op_def.output.extend(["output1"])
+        node = convert.OperatorDefToNodeProto(op_def)
+        new_op_def = convert.NodeProtoToOperatorDef(node)
+
+        input1 = np.random.randn(1, 3, 1, 5).astype(np.float32)
+        input2 = np.random.randn(2, 1, 4, 1).astype(np.float32)
+        ref_output1 = input1 + input2
+        workspace.FeedBlob("input1", input1)
+        workspace.FeedBlob("input2", input2)
+        self.assertEqual(workspace.RunOperatorOnce(new_op_def.SerializeToString()), True)
+
+        self.assertEqual(workspace.HasBlob("output1"), True)
+        fetched_back = workspace.FetchBlob("output1")
+        np.testing.assert_array_equal(fetched_back, ref_output1)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 7ebee57d490848..7062ead045df1c 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -1187,10 +1187,17 @@ void addGlobalMethods(py::module& m) {
     return true;
   });
   m.def("nets", []() { return gWorkspace->Nets(); });
-  m.def("run_operator_once", [](const py::bytes& op_def) {
+  m.def("run_operator_once", [](const py::bytes& op_def, bool legacy_proto = true) {
     CAFFE_ENFORCE(gWorkspace);
     OperatorDef def;
-    CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast<std::string>(), &def));
+    if (legacy_proto) {
+      CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast<std::string>(), &def));
+    } else {
+      ::torch::NodeProto node;
+      CAFFE_ENFORCE(
+          ParseProtoFromLargeString(op_def.cast<std::string>(), &node));
+      NodeProtoToOperatorDef(node, &def);
+    }
     py::gil_scoped_release g;
     CAFFE_ENFORCE(gWorkspace->RunOperatorOnce(def));
     return true;
@@ -1527,6 +1534,38 @@ void addGlobalMethods(py::module& m) {
     CAFFE_ENFORCE(blob);
     return BlobStat::sizeBytes(*blob);
   });
+  m.def("argument_to_attribute_proto", [](py::bytes arg_str) -> py::bytes {
+    Argument arg;
+    CAFFE_ENFORCE(
+        ParseProtoFromLargeString(arg_str.cast<std::string>(), &arg));
+    ::torch::AttributeProto attr;
+    ArgumentToAttributeProto(arg, &attr);
+    return attr.SerializeAsString();
+  });
+  m.def("attribute_proto_to_argument", [](py::bytes attr_str) -> py::bytes {
+    ::torch::AttributeProto attr;
+    CAFFE_ENFORCE(
+        ParseProtoFromLargeString(attr_str.cast<std::string>(), &attr));
+    Argument arg;
+    AttributeProtoToArgument(attr, &arg);
+    return arg.SerializeAsString();
+  });
+  m.def("operator_def_to_node_proto", [](py::bytes op_str) -> py::bytes {
+    OperatorDef op_def;
+    CAFFE_ENFORCE(
+        ParseProtoFromLargeString(op_str.cast<std::string>(), &op_def));
+    ::torch::NodeProto node;
+    OperatorDefToNodeProto(op_def, &node);
+    return node.SerializeAsString();
+  });
+  m.def("node_proto_to_operator_def", [](py::bytes node_str) -> py::bytes {
+    ::torch::NodeProto node_proto;
+    CAFFE_ENFORCE(
+        ParseProtoFromLargeString(node_str.cast<std::string>(), &node_proto));
+    OperatorDef op_def;
+    NodeProtoToOperatorDef(node_proto, &op_def);
+    return op_def.SerializeAsString();
+  });
   m.def("support_onnx_export", [](const std::string& op) -> bool {
     const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(op);
     if (!schema) {
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index ef02f64dc993b7..a41cc153177639 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -163,8 +163,8 @@ def GetOperatorCost(operator, blobs):
     return C.get_operator_cost(StringifyProto(operator), blobs)
 
 
-def RunOperatorOnce(operator):
-    return C.run_operator_once(StringifyProto(operator))
+def RunOperatorOnce(operator, legacy_proto=True):
+    return C.run_operator_once(StringifyProto(operator), legacy_proto)
 
 
 def RunOperatorsOnce(operators):
diff --git a/caffe2/utils/proto_convert.cc b/caffe2/utils/proto_convert.cc
index 1d69c8c80c15ac..790bd274291dcb 100644
--- a/caffe2/utils/proto_convert.cc
+++ b/caffe2/utils/proto_convert.cc
@@ -2,4 +2,185 @@
 #include "caffe2/core/logging.h"
 
 namespace caffe2 {
+
+C10_EXPORT void ArgumentToAttributeProto(
+    const Argument& arg,
+    ::torch::AttributeProto* attr) {
+  CAFFE_ENFORCE(arg.has_name());
+  attr->set_name(arg.name());
+  if (arg.has_f()) {
+    attr->set_f(arg.f());
+  } else if (arg.has_i()) {
+    attr->set_i(arg.i());
+  } else if (arg.has_s()) {
+    attr->set_s(arg.s());
+  } else if (arg.has_n()) {
+    // TODO
+    CAFFE_THROW("NetDef conversion is not implemented yet.");
+  } else if (arg.floats_size() > 0) {
+    attr->mutable_floats()->CopyFrom(arg.floats());
+  } else if (arg.ints_size() > 0) {
+    attr->mutable_ints()->CopyFrom(arg.ints());
+  } else if (arg.strings_size() > 0) {
+    attr->mutable_strings()->CopyFrom(arg.strings());
+  } else if (arg.nets_size() > 0) {
+    // TODO
+    CAFFE_THROW("NetDefs conversion is not implemented yet.");
+  }
+}
+
+C10_EXPORT void AttributeProtoToArgument(
+    const ::torch::AttributeProto& attr,
+    Argument* arg) {
+  CAFFE_ENFORCE(attr.has_name());
+  arg->set_name(attr.name());
+  CAFFE_ENFORCE(attr.has_type());
+  const auto type = attr.type();
+  if (type ==
+      ::torch::AttributeProto_AttributeType::
+          AttributeProto_AttributeType_FLOAT) {
+    CAFFE_ENFORCE(attr.has_f());
+    arg->set_f(attr.f());
+  } else if (
+      type ==
+      ::torch::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) {
+    CAFFE_ENFORCE(attr.has_i());
+    arg->set_i(attr.i());
+  } else if (
+      type ==
+      ::torch::AttributeProto_AttributeType::
+          AttributeProto_AttributeType_STRING) {
+    CAFFE_ENFORCE(attr.has_s());
+    arg->set_s(attr.s());
+  } else if (
+      type ==
+      ::torch::AttributeProto_AttributeType::
+          AttributeProto_AttributeType_TENSOR) {
CAFFE_THROW("Caffe2's Argument does not support tensor as attribute."); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_GRAPH) { + // TODO + CAFFE_THROW("GraphProto conversion is not implemented yet."); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_FLOATS) { + arg->mutable_floats()->CopyFrom(attr.floats()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_INTS) { + arg->mutable_ints()->CopyFrom(attr.ints()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRINGS) { + arg->mutable_strings()->CopyFrom(attr.strings()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_TENSORS) { + CAFFE_THROW("Caffe2's Argument does not support tensors as attribute."); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_GRAPHS) { + // TODO + CAFFE_THROW("GraphProtos conversion is not implemented yet."); + } else { + CAFFE_THROW("Unknow Attribute type."); + } +} + +C10_EXPORT void OperatorDefToNodeProto( + const OperatorDef& def, + ::torch::NodeProto* node) { + node->mutable_input()->CopyFrom(def.input()); + node->mutable_output()->CopyFrom(def.output()); + if (def.has_name()) { + node->set_name(def.name()); + } + CAFFE_ENFORCE(def.has_type()); + node->set_op_type(def.type()); + for (int i = 0; i < def.arg_size(); ++i) { + auto attr = node->add_attribute(); + ArgumentToAttributeProto(def.arg(i), attr); + } + if (def.has_device_option()) { + node->mutable_device_option()->CopyFrom(def.device_option()); + } + if (def.has_engine()) { + auto attr = node->add_annotations(); + attr->set_name("engine"); + attr->set_type(::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRING); + attr->set_s(def.engine()); + } + if (def.control_input_size() > 0) { + auto attr = node->add_annotations(); + attr->set_name("control_input"); + attr->set_type(::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRINGS); + attr->mutable_strings()->CopyFrom(def.control_input()); + } + if (def.has_is_gradient_op()) { + auto attr = node->add_annotations(); + attr->set_name("is_gradient_op"); + attr->set_type(::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_INT); + if (def.is_gradient_op()) { + attr->set_i(1); + } else { + attr->set_i(0); + } + } + if (def.has_debug_info()) { + node->set_doc_string(def.debug_info()); + } +} + +C10_EXPORT void NodeProtoToOperatorDef( + const ::torch::NodeProto& node, + OperatorDef* def) { + def->mutable_input()->CopyFrom(node.input()); + def->mutable_output()->CopyFrom(node.output()); + if (node.has_name()) { + def->set_name(node.name()); + } + + CAFFE_ENFORCE(node.has_op_type()); + def->set_type(node.op_type()); + for (int i = 0; i < node.attribute_size(); ++i) { + auto arg = def->add_arg(); + AttributeProtoToArgument(node.attribute(i), arg); + } + if (node.has_doc_string()) { + def->set_debug_info(node.doc_string()); + } + for (int i = 0; i < node.annotations_size(); ++i) { + const auto& attr = node.annotations(i); + CAFFE_ENFORCE(attr.has_name()); + if (attr.name() == "engine") { + CAFFE_ENFORCE(attr.has_s()); + def->set_engine(attr.s()); + } else if (attr.name() == "control_input") { + def->mutable_control_input()->CopyFrom(attr.strings()); + } else if (attr.name() == "is_gradient_op") { + CAFFE_ENFORCE(attr.has_i()); + if (i == 0) { + 
+        def->set_is_gradient_op(false);
+      } else {
+        def->set_is_gradient_op(true);
+      }
+    } else {
+      // Annotations that are not recognized above are preserved as
+      // plain arguments.
+      auto arg = def->add_arg();
+      AttributeProtoToArgument(attr, arg);
+    }
+  }
+  if (node.has_device_option()) {
+    def->mutable_device_option()->CopyFrom(node.device_option());
+  }
+}
+
 } // namespace caffe2
diff --git a/caffe2/utils/proto_convert.h b/caffe2/utils/proto_convert.h
index 91bcf1bafa2298..a9ca9c3ad4fa41 100644
--- a/caffe2/utils/proto_convert.h
+++ b/caffe2/utils/proto_convert.h
@@ -6,6 +6,20 @@
 #include "caffe2/proto/torch_pb.h"
 
 namespace caffe2 {
+
+CAFFE2_API void ArgumentToAttributeProto(
+    const Argument& arg,
+    ::torch::AttributeProto* attr);
+CAFFE2_API void AttributeProtoToArgument(
+    const ::torch::AttributeProto& attr,
+    Argument* arg);
+CAFFE2_API void OperatorDefToNodeProto(
+    const OperatorDef& def,
+    ::torch::NodeProto* node);
+CAFFE2_API void NodeProtoToOperatorDef(
+    const ::torch::NodeProto& node,
+    OperatorDef* def);
+
 } // namespace caffe2
 
 #endif // CAFFE2_UTILS_PROTO_CONVERT_H_
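A minimal sketch of how the conversion path added by this patch composes end to end, mirroring `testEnd2End` above (the `Add` operator and blob names are illustrative only):

```python
import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import convert, workspace

# Build a legacy Caffe2 OperatorDef and convert it to a torch NodeProto.
op_def = caffe2_pb2.OperatorDef()
op_def.type = "Add"
op_def.input.extend(["input1", "input2"])
op_def.output.extend(["output1"])
node = convert.OperatorDefToNodeProto(op_def)

workspace.FeedBlob("input1", np.random.randn(3, 4).astype(np.float32))
workspace.FeedBlob("input2", np.random.randn(3, 4).astype(np.float32))

# The serialized NodeProto runs directly once legacy_proto=False is passed...
workspace.RunOperatorOnce(node.SerializeToString(), legacy_proto=False)

# ...and it also round-trips back to an OperatorDef for the legacy path.
round_tripped = convert.NodeProtoToOperatorDef(node)
workspace.RunOperatorOnce(round_tripped)
```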