Fixed CudaCodeGen output streams. Switch to __ldg by default (pytorch#148)

zheng-xq · Mikhail Zolotukhin · commit e3177741a10c · 2020-02-18T10:55:22.000-08:00
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -33,7 +33,7 @@ class ScopedVarName {
       const std::string& name)
       : ScopedVarName(&manager->unique_name_mapping_, var, name) {}
 
-  ~ScopedVarName() {
+  ~ScopedVarName() noexcept(false) {
     auto iter = mapping_->find(var_);
     TORCH_CHECK(iter != mapping_->end(), "Invalid var entry");
     mapping_->erase(var_);
@@ -124,29 +124,34 @@ void CudaPrinter::visit(const For* v) {
   }
 }
 
+void CudaPrinter::visit(const Load* v) {
+  // TODO: find a better metric for deciding whether to emit __ldg; support different dtypes.
+  os() << "__ldg(" << v->base_handle() << " + " << v->index() << ")";
+}
+
 void CudaCodeGen::Initialize() {
   printer_.reset(new CudaPrinter(&oss_));
   // TODO: handle multiple kernels.
   // TODO: handle dynamic dimension.
   // TODO: call nvrtc.
-  oss_ << "extern \"C\" __global__" << std::endl << "void f(";
+  os() << "extern \"C\" __global__" << std::endl << "void f(";
   const std::vector<BufferArg> buffer_args = this->buffer_args();
   for (int i = 0; i < buffer_args.size(); i++) {
     if (i > 0) {
-      oss_ << ", ";
+      os() << ", ";
     }
     const BufferArg& buffer_arg = buffer_args[i];
     const Var& var = buffer_arg.var();
     Dtype dtype = buffer_arg.dtype();
-    oss_ << dtype.ToCppString() << (buffer_arg.isVar() ? " " : "* ")
+    os() << dtype.ToCppString() << (buffer_arg.isVar() ? " " : "* ")
          << name_manager()->get_unique_name(var);
   }
-  oss_ << ") {";
+  os() << ") {";
 
-  oss_ << std::endl;
+  os() << std::endl;
   stmt().accept(printer_.get());
-  oss_ << std::endl;
-  oss_ << "}";
+  os() << std::endl;
+  os() << "}";
 
   // Check that all block extents had been set.
   const std::vector<Expr>& gpu_block_extents = printer_->gpu_block_extents();
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.h b/torch/csrc/jit/tensorexpr/cuda_codegen.h
@@ -22,7 +22,7 @@ namespace tensorexpr {
 // A class that overrides the underlying IRPrinter to produce Cuda C.
 class CudaPrinter : public IRPrinter {
  public:
-  explicit CudaPrinter(std::ostream* os) : IRPrinter(*os), os_(os) {}
+  explicit CudaPrinter(std::ostream* os) : IRPrinter(*os) {}
 
   void visit(const Cast* v) {
     auto dtype = v->dtype();
@@ -38,9 +38,7 @@ class CudaPrinter : public IRPrinter {
 
   void visit(const For* v);
 
-  std::ostream& os() {
-    return *os_;
-  }
+  void visit(const Load* v);
 
   const std::vector<Expr>& gpu_block_extents() const {
     return gpu_block_extents_;
@@ -53,7 +51,6 @@ class CudaPrinter : public IRPrinter {
   using IRPrinter::name_manager;
 
  private:
-  std::ostream* os_ = nullptr;
   std::vector<Expr> gpu_block_extents_;
   std::vector<Expr> gpu_thread_extents_;
 };
@@ -94,6 +91,10 @@ class TORCH_API CudaCodeGen : public CodeGen {
     return printer_->name_manager();
   }
 
+  std::ostream& os() {
+    return printer_->os();
+  }
+
   std::ostringstream oss_;
   std::unique_ptr<CudaPrinter> printer_;
   CUfunction function_;
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h
@@ -64,10 +64,6 @@ class TORCH_API IRPrinter : public IRVisitor {
   }
 
  private:
-  std::ostream& raw_os() {
-    return printer_os_;
-  }
-
   PrinterStream printer_os_;
   UniqueNameManager name_manager_;
 };