
Blackwell matmul scheduler smem epilogue support #4541


Merged · 43 commits · Jun 3, 2025

Commits (43)
f0d3866
blackwell smem epilogue
zasdfgbnm May 29, 2025
e4cd526
save
zasdfgbnm May 29, 2025
defa155
unskip
zasdfgbnm May 29, 2025
e0be8db
save
zasdfgbnm May 29, 2025
d8a41d0
errmsg
zasdfgbnm May 29, 2025
378a25f
save
zasdfgbnm May 29, 2025
4bd5d83
save
zasdfgbnm May 29, 2025
61b5324
save
zasdfgbnm May 29, 2025
0905925
save
zasdfgbnm May 29, 2025
232d8b2
save
zasdfgbnm May 29, 2025
9b7f2dd
save
zasdfgbnm May 29, 2025
9af0554
cleanup print
zasdfgbnm May 29, 2025
e961520
save
zasdfgbnm May 29, 2025
e4fe3f7
save
zasdfgbnm May 29, 2025
1073fe1
save
zasdfgbnm May 29, 2025
5809090
save
zasdfgbnm May 29, 2025
2937b01
save
zasdfgbnm May 29, 2025
b9a80c9
fix
zasdfgbnm May 29, 2025
f248b29
save
zasdfgbnm May 29, 2025
07209b1
save
zasdfgbnm May 29, 2025
ad007cc
save
zasdfgbnm May 29, 2025
a902814
save
zasdfgbnm May 29, 2025
e15f235
save
zasdfgbnm May 29, 2025
abd3bef
save
zasdfgbnm May 29, 2025
2dd0fba
save
zasdfgbnm May 30, 2025
0500579
try1
zasdfgbnm May 30, 2025
a14764f
try
zasdfgbnm May 30, 2025
9983de7
try
zasdfgbnm May 30, 2025
685f4bd
save
zasdfgbnm May 30, 2025
b6a80de
save
zasdfgbnm May 30, 2025
721bc09
save
zasdfgbnm May 30, 2025
2dc4f7c
save
zasdfgbnm May 30, 2025
2c7abf5
save
zasdfgbnm May 30, 2025
6f64440
save
zasdfgbnm May 30, 2025
b1078fd
save
zasdfgbnm May 30, 2025
8e8c28f
save
zasdfgbnm May 30, 2025
20fee8b
save
zasdfgbnm May 30, 2025
8e00e7d
save doc
zasdfgbnm May 30, 2025
09985a3
try
zasdfgbnm May 30, 2025
1965ea7
save
zasdfgbnm May 30, 2025
a5f4020
move up
zasdfgbnm Jun 3, 2025
c57e3fa
remove ldmatrix
zasdfgbnm Jun 3, 2025
6be1759
Merge branch 'main' into smem-epilogue
zasdfgbnm Jun 3, 2025
123 changes: 116 additions & 7 deletions csrc/scheduler/matmul_hopper+.cpp
@@ -34,6 +34,9 @@ namespace schedule_matmul {

namespace {

constexpr int64_t hardcoded_smem_vectorize_factor = 4;
constexpr int64_t hardcoded_blackwell_splitk_vectorization_factor = 4;

// Find the first MatmulDimRole from left to right in a vector of roles
int64_t findFirstRole(
std::vector<MatmulDimRole>& roles,
@@ -649,6 +652,7 @@ int64_t HopperPlus::getLdTMemVectorizeFactor() const {

void HopperPlus::scheduleEpilogueWithoutSmemEpilogueBlackwell() {
const bool has_splitk = params_->splitk_factor != 1;
int64_t tmem_vectorize_factor = getLdTMemVectorizeFactor();
std::vector<TensorView*> cached_tvs;
std::vector<TensorView*> propagate_to =
splitk_sums_.empty() ? mma_results_ : splitk_sums_;
@@ -674,7 +678,6 @@ void HopperPlus::scheduleEpilogueWithoutSmemEpilogueBlackwell() {
// vectorize the TMem load with a factor of v (tmem_vectorize_factor).
// [..., Mo * No, Mw, Nw, Mi (TIDx), Ni / v, v (Vectorize)]
d->axis(-2)->parallelize(ParallelType::TIDx);
int64_t tmem_vectorize_factor = getLdTMemVectorizeFactor();
if (tmem_vectorize_factor < getN(params_->mma_macro)) {
d->split(-1, tmem_vectorize_factor);
}
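
As a worked example of the layout named in the comment above (a sketch only; the macro size and vectorization factor below are assumptions, not values taken from this diff):

  // Assume getN(params_->mma_macro) == 256 and tmem_vectorize_factor v == 4.
  // After transformLikeMmaOutputWithoutK, d's loop domain is
  //   [..., Mo * No, Mw, Nw, Mi, Ni] with Ni == 256.
  d->axis(-2)->parallelize(ParallelType::TIDx); // Mi -> TIDx
  d->split(-1, 4);                              // 4 < 256, so Ni -> [64, 4]
  // Result: [..., Mo * No, Mw, Nw, Mi (TIDx), 64, 4]; the innermost extent-4
  // IterDomain is what later receives ParallelType::Vectorize for the TMem load.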
@@ -767,17 +770,14 @@ void HopperPlus::scheduleEpilogueWithoutSmemEpilogue() {
}
}

void HopperPlus::scheduleEpilogueWithSmemEpilogue() {
void HopperPlus::scheduleEpilogueWithSmemEpilogueHopper() {
constexpr int64_t ldst_matrix_tile_m = 16;
constexpr int64_t ldst_matrix_tile_n = 16;
fusion_->manage("ldst_matrix_m_tile", ldst_matrix_tile_m);
fusion_->manage("ldst_matrix_n_tile", ldst_matrix_tile_n);
fusion_->manage("ldst_matrix_m_smem", params_->tile_sizes.warp_tile.m);
fusion_->manage("ldst_matrix_n_smem", params_->tile_sizes.warp_tile.n);

// Apply LdMatrix to any epilogue inputs loaded to smem with TMA.
Collaborator review comment: Was this unused?

std::vector<TensorView*> tma_load_epilogue_inputs;

// Propagate to (not including) the splitk output if there is a splitk
// else this is just mma_results_
std::vector<TensorView*> propagate_to =
@@ -930,6 +930,117 @@ void HopperPlus::scheduleEpilogueWithSmemEpilogue() {
}
}

void HopperPlus::scheduleEpilogueWithSmemEpilogueBlackwell() {
const bool has_splitk = params_->splitk_factor != 1;
int64_t tmem_vectorize_factor = getLdTMemVectorizeFactor();

std::vector<TensorView*> tmem_ld_tvs =
!has_splitk ? createTMemLoad() : std::vector<TensorView*>{};

// Propagate to (not including) the splitk output if there is a splitk
// else this is just mma_results_
std::vector<TensorView*> register_tvs;
std::vector<TensorView*> propagate_to =
splitk_sums_.empty() ? mma_results_ : splitk_sums_;
for (auto& [c, c_cache] : cached_epilogue_inputs_) {
bool is_2d_epilogue_input =
TensorDomain::noBroadcasts(c_cache->domain()->logical()).size() == 2;
if (is_2d_epilogue_input && params_->async_gmem_load_operands) {
// Schedule TMA load into shared memory for epilogue input
c_cache->definition()->as<LoadStoreOp>()->setOpType(
LoadStoreOpType::CpAsyncBulkTensorTile);
c_cache->setMemoryType(MemoryType::Shared);
blockTileTensors({c_cache});
parallelizeBlocks({c_cache});
transformLikeMmaOutputWithoutK(c_cache);
c_cache->setAllocationDomain(c_cache->getLoopDomain(), true);
for (int64_t i = -5; i <= -1; i++) {
c_cache->axis(i)->parallelize(ParallelType::Bulk);
}

// Schedule smem->register load for epilogue input
TensorView* reg_tv = cacheAfter(c_cache);
register_tvs.push_back(reg_tv);
blockTileTensors({reg_tv});
parallelizeBlocks({reg_tv});
transformLikeMmaOutputWithoutK(reg_tv);
}
// Propagate changes to the cache_after tensor
propagate_to.push_back(c);
}

// TMem load is scheduled separately, so don't propagate to it.
propagate_to.insert(
propagate_to.end(), tmem_ld_tvs.begin(), tmem_ld_tvs.end());

// The chain of operations storing data to global memory:
// dc (registers) -> d_smem -> [tma_store] -> d (gmem)
// We schedule d_smem and propagate it back.
for (Val* dv : fusion_->outputs()) {
TensorView* d = dv->as<TensorView>();
NVF_ERROR(d->definition() && d->definition()->isA<LoadStoreOp>());
TensorView* dc = d->definition()->input(0)->as<TensorView>();
TensorView* d_smem = cacheBefore(d, LoadStoreOpType::Set);
dc->setMemoryType(MemoryType::Local);
d_smem->setMemoryType(MemoryType::Shared);

// We schedule the epilogue like:
// (v = tmem_vectorize_factor, vv = smem_vectorize_factor)
// [..., Mo * No, Mw, Nw, Mi (TIDx), Ni / v, v/vv, vv]
blockTileTensors({d, d_smem});
parallelizeBlocks({d, d_smem});
for (auto tv : {d, d_smem}) {
transformLikeMmaOutputWithoutK(tv);
tv->axis(-2)->parallelize(ParallelType::TIDx);
if (tmem_vectorize_factor < getN(params_->mma_macro)) {
tv->split(-1, tmem_vectorize_factor);
}
}
if (tmem_vectorize_factor > hardcoded_smem_vectorize_factor) {
d_smem->split(-1, hardcoded_smem_vectorize_factor);
}

scheduler_utils::BoundedDirectionalTransformPropagator::backward(
d_smem,
-1,
propagate_to,
scheduler_utils::BoundedDirectionalTransformPropagator::Options()
.propagateParallelType());

d_smem->axis(-1)->parallelize(ParallelType::Vectorize);
d_smem->setAllocationDomain(d_smem->getLoopDomain(), true);

// Schedule the global memory output, which is written by a TMA store
d->definition()->as<LoadStoreOp>()->setOpType(
LoadStoreOpType::CpAsyncBulkTensorTile);
for (int64_t i = -5; i <= -1; i++) {
d->axis(i)->parallelize(ParallelType::Bulk);
}
}

// Schedule TMem load as:
// (v = tmem_vectorize_factor)
// [..., Mo * No, Mw, Nw, Mi (TIDx), Ni / v, v (Vectorize)]
blockTileTensors(tmem_ld_tvs);
parallelizeBlocks(tmem_ld_tvs);
for (TensorView* tmem_ld_tv : tmem_ld_tvs) {
transformLikeMmaOutputWithoutK(tmem_ld_tv);
tmem_ld_tv->axis(-2)->parallelize(ParallelType::TIDx);
if (tmem_vectorize_factor < getN(params_->mma_macro)) {
tmem_ld_tv->split(-1, tmem_vectorize_factor);
}
tmem_ld_tv->axis(-1)->parallelize(ParallelType::Vectorize);
}
}

void HopperPlus::scheduleEpilogueWithSmemEpilogue() {
if (isHopper(params_->mma_macro)) {
scheduleEpilogueWithSmemEpilogueHopper();
} else {
scheduleEpilogueWithSmemEpilogueBlackwell();
}
}

void HopperPlus::scheduleEpilogue() {
if (params_->use_smem_epilogue) {
scheduleEpilogueWithSmemEpilogue();
@@ -954,8 +1065,6 @@ void HopperPlus::scheduleSplitKSumHopper() {
}
}

constexpr int64_t hardcoded_blackwell_splitk_vectorization_factor = 4;

// Schedule TMem load tv and splitk_sum tv as follows:
// v = vectorization factor for TMem load
// vv = vectorization factor for splitk_sum, hardcoded to 4
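Because the new Blackwell function is long, a compact shape-tracking sketch of its store chain (dc registers -> d_smem -> TMA store -> d) may help. This is a sketch only, not the scheduler itself: the concrete numbers (N = 256, v = 8, vv = 4) are assumptions chosen so that both splits fire, and d_smem is taken to already have the block-tiled loop domain produced by transformLikeMmaOutputWithoutK.

  // Assumed: N = getN(params_->mma_macro) = 256, v = tmem_vectorize_factor = 8,
  // vv = hardcoded_smem_vectorize_factor = 4.
  // Start: d_smem loop domain is [..., Mo * No, Mw, Nw, Mi, Ni], Ni == 256.
  d_smem->axis(-2)->parallelize(ParallelType::TIDx); // Mi -> TIDx
  d_smem->split(-1, 8); // v < N:  Ni -> [Ni / v, v]   => [..., Mi (TIDx), 32, 8]
  d_smem->split(-1, 4); // v > vv: v  -> [v / vv, vv]  => [..., Mi (TIDx), 32, 2, 4]
  // After the backward transform propagation, the innermost extent-4 IterDomain
  // is vectorized, e.g. 4 x 4 B = 16 B per smem store if the output is fp32.
  d_smem->axis(-1)->parallelize(ParallelType::Vectorize);
  d_smem->setAllocationDomain(d_smem->getLoopDomain(), true);
  // The gmem output d is left at [..., Mi, Ni / v, v] (no vv split); its op type
  // is set to CpAsyncBulkTensorTile and its innermost five IterDomains are
  // parallelized with ParallelType::Bulk, making the smem -> gmem copy a TMA store.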
2 changes: 2 additions & 0 deletions csrc/scheduler/matmul_hopper+.h
@@ -183,6 +183,8 @@ class HopperPlus : public Common {
void scheduleEpilogueWithoutSmemEpilogueHopper();
void scheduleEpilogueWithoutSmemEpilogueBlackwell();
void scheduleEpilogueWithoutSmemEpilogue();
void scheduleEpilogueWithSmemEpilogueHopper();
void scheduleEpilogueWithSmemEpilogueBlackwell();
void scheduleEpilogueWithSmemEpilogue();
void scheduleEpilogue();

5 changes: 3 additions & 2 deletions csrc/tensor_view.cpp
@@ -465,8 +465,9 @@ TensorView* TensorView::split(int64_t axis, Val* factor, bool inner_split) {

NVF_CHECK(
this->axis(axis)->getParallelType() == ParallelType::Serial,
"Splitting an axis of non-Serial parallel type is not supported at this "
"time."
"Splitting an axis (",
this->axis(axis)->toString(),
") of non-Serial parallel type is not supported at this time."
" Parallelization strategy must be set after calling split: ",
toString());

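For context on the expanded NVF_CHECK message above, here is a hedged illustration of the situation it reports (a hypothetical fragment, not code from this PR; tv stands for any TensorView):

  // Hypothetical: the axis being split has already been parallelized.
  tv->axis(-1)->parallelize(ParallelType::TIDx);
  // TensorView::split now includes this->axis(axis)->toString() in the
  // message, so the error names the exact offending IterDomain instead of
  // only restating the rule.
  tv->split(-1, 4);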
3 changes: 0 additions & 3 deletions tests/cpp/test_matmul_scheduler.cpp
@@ -3330,9 +3330,6 @@ class HopperPlusMatmulSchedulerTest
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0);
} else {
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(10, 0, 11, 0);
if (use_smem_epilogue) {
GTEST_SKIP() << "TMA store is not supported for Blackwell yet.";
}
}

if (a_k_inner) {