Add bindings for FusionExecutorCache (#4513)

rdspring1 · web-flow · commit 8b70ef2f199f · 2025-05-28T15:13:41.000-07:00
This PR creates the bindings for `FusionExecutorCache`, allowing fusions to run CUDA kernels in `nvfuser_direct`. Functions bound for `FusionExecutorCache`: * get_cuda_kernel * get_most_recent_scheduled_ir * get_scheduled_ir * is_compiled * execute Create `python/python_direct/direct_utils.h` for `python_direct`-only helper functions * Add `from_pyiterable` (Python iterable to `KernelArgumentHolder`) and `to_tensor_vector` (`KernelArgumentHolder` to a vector of `at::Tensor`) New function for python `FusionDefinition`: * `execute` - It creates the FusionExecutorCache if it does not already exist and runs the fusion with the given input arguments. Testing * Created `test_fusion_execution_cache` and `test_define_tensor` PR Stack: #4409 Create python FusionDefinition for nvfuser_next #4513 Add bindings for FusionExecutorCache **<<< This PR.** #4516 Add the remaining binary operations #4517 Add the bindings for unary operations #4518 Add the bindings for reduction operations #4519 Move helper functions from python_frontend to python_common #4520 Create python reproducer from Fusion IR for nvfuser_direct #4521 Recreate python_frontend test_basic for nvfuser_direct
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -572,6 +572,7 @@ if(BUILD_PYTHON)
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/ir.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/ops.cpp
     ${NVFUSER_PYTHON_DIRECT_BINDINGS}/runtime.cpp
+    ${NVFUSER_PYTHON_DIRECT_BINDINGS}/direct_utils.cpp
   )
   add_library(nvf_py_direct_internal OBJECT ${NVFUSER_PYTHON_DIRECT_SRCS})
 
diff --git a/python/nvfuser_direct/__init__.py b/python/nvfuser_direct/__init__.py
@@ -10,6 +10,7 @@
 
 import os
 import torch
+import traceback
 
 # This is needed when libnvfuser_direct.so is patched and doesn't have the pytorch library location available.
 pytorch_lib_dir = os.path.join(os.path.dirname(torch.__file__), "lib")
@@ -42,25 +43,35 @@ def __init__(self):
         # Monkey patching nvfuser_direct.ops submodule to mimic python_frontend
         # FusionDefinition.ops API. This is to maintain backwards compatibilty.
         self.ops = _C_DIRECT.ops
-        self.fusion = _C_DIRECT.Fusion()
-        self.fusion_guard = None
+        self._fusion = None
+        self._fusion_guard = None
+
+    @property
+    def fusion(self):
+        if not hasattr(self, "fec"):
+            return self._fusion
+        else:
+            return self.fec.fusion()
 
     def __enter__(self):
         """
         Enter the context manager.
+
         Returns
         -------
         FusionDefinition
             The FusionDefinition instance
         """
-        self.fusion_guard = _C_DIRECT.FusionGuard(self.fusion)
+        self._fusion = _C_DIRECT.Fusion()
+        self._fusion_guard = _C_DIRECT.FusionGuard(self._fusion)
         return self
 
     def __exit__(self, exception_type, exception_value, exception_traceback):
         """
         Exit the context manager and handle any exceptions.
         This method is called when exiting the 'with' block, whether normally or due to an exception.
         The arguments provide information about any exception that occurred:
+
         Parameters
         ----------
         excecption_type : type or None
@@ -73,7 +84,7 @@ def __exit__(self, exception_type, exception_value, exception_traceback):
             The traceback object containing the call stack.
             None if no exception occurred.
         """
-        self.fusion_guard = None
+        del self._fusion_guard
         if exception_type is not None:
             print(f"Exception occurred: {exception_type.__name__}: {exception_value}")
             if exception_traceback is not None:
@@ -84,29 +95,60 @@ def __exit__(self, exception_type, exception_value, exception_traceback):
     def define_tensor(self, *args, **kwargs):
         """
         Define a new tensor input for the fusion.
+
         Parameters
         ----------
         *args
             Positional arguments passed to _C_DIRECT.define_tensor
         **kwargs
             Keyword arguments passed to _C_DIRECT.define_tensor
+
         Returns
         -------
         Tensor
             The defined tensor
         """
         tv = _C_DIRECT.define_tensor(*args, **kwargs)
-        self.fusion.add_input(tv)
+        self._fusion.add_input(tv)
         return tv
 
     def add_output(self, *args, **kwargs):
         """
         Add an output to the fusion.
+
         Parameters
         ----------
         *args
             Positional arguments passed to fusion.add_output
         **kwargs
             Keyword arguments passed to fusion.add_output
         """
-        self.fusion.add_output(*args, **kwargs)
+        self._fusion.add_output(*args, **kwargs)
+
+    def execute(self, inputs, *, device=None, auto_schedule=True) -> list[torch.Tensor]:
+        """
+        Execute the fusion with the given inputs.
+
+        Parameters
+        ----------
+        inputs : list of torch.Tensor
+            Input tensors and scalars to the fusion
+        device : torch.device, optional
+            Device to execute the fusion on
+        auto_schedule : bool, default=True
+            Whether to use automatic scheduling
+
+        Returns
+        -------
+        list of torch.Tensor
+            Output tensors from the fusion
+        """
+        if auto_schedule:
+            if not hasattr(self, "fec"):
+                self.fec = _C_DIRECT.FusionExecutorCache(self._fusion)
+                # A copy of fusion is created after construction FusionExecutorCache
+                # Delete the _fusion and reference the fusion inside FusionExecutorCache
+                del self._fusion
+            return self.fec.execute(inputs)
+        else:
+            raise RuntimeError("Manual scheduling is not supported yet.")
diff --git a/python/python_direct/direct_utils.cpp b/python/python_direct/direct_utils.cpp
@@ -0,0 +1,51 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+#include <direct_utils.h>
+#include <algorithm>
+#include <limits>
+
+namespace nvfuser::python {
+
+// Convert a Python iterable of fusion inputs into a KernelArgumentHolder.
+//
+// Each element is converted with torch::jit::toIValue and pushed onto the
+// holder. An element that is itself a Python list or tuple is flattened one
+// level: its items are pushed individually.
+//
+// `device`, when set, is narrowed to int8_t and handed to setDeviceIndex.
+// The call fails through NVF_CHECK if the value does not fit in int8_t.
+KernelArgumentHolder from_pyiterable(
+    const py::iterable& iter,
+    std::optional<int64_t> device) {
+  KernelArgumentHolder args;
+  for (py::handle obj : iter) {
+    // Allows a vector of sizes to be passed as a list/tuple
+    if (py::isinstance<py::list>(obj) || py::isinstance<py::tuple>(obj)) {
+      for (py::handle item : obj) {
+        args.push(torch::jit::toIValue(item, c10::AnyType::get()));
+      }
+    } else {
+      args.push(torch::jit::toIValue(obj, c10::AnyType::get()));
+    }
+  }
+
+  // Transform int64_t device to int8_t. The value is range-checked against
+  // int8_t itself so that an index such as 128..255 is rejected instead of
+  // silently wrapping to a different (negative) device index.
+  std::optional<int8_t> selected_device = std::nullopt;
+  if (device.has_value()) {
+    constexpr int64_t kMinDeviceIndex = std::numeric_limits<int8_t>::min();
+    constexpr int64_t kMaxDeviceIndex = std::numeric_limits<int8_t>::max();
+    NVF_CHECK(
+        device.value() >= kMinDeviceIndex && device.value() <= kMaxDeviceIndex,
+        "Device index ",
+        device.value(),
+        " does not fit in int8_t; maximum device index is ",
+        kMaxDeviceIndex);
+    selected_device = static_cast<int8_t>(device.value());
+  }
+  args.setDeviceIndex(selected_device);
+  return args;
+}
+
+// Collect every entry of `outputs` as an at::Tensor, preserving order.
+std::vector<at::Tensor> to_tensor_vector(const KernelArgumentHolder& outputs) {
+  std::vector<at::Tensor> tensors;
+  // One allocation up front; the result has exactly outputs.size() entries.
+  tensors.reserve(outputs.size());
+  for (const PolymorphicValue& value : outputs) {
+    tensors.push_back(value.as<at::Tensor>());
+  }
+  return tensors;
+}
+
+} // namespace nvfuser::python
diff --git a/python/python_direct/direct_utils.h b/python/python_direct/direct_utils.h
@@ -0,0 +1,26 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <runtime/executor_kernel_arg.h>
+#include <torch/csrc/jit/python/pybind_utils.h>
+#include <optional>
+#include <vector>
+
+namespace nvfuser::python {
+
+// Convert a py::iterable of fusion inputs to a KernelArgumentHolder.
+// `device` is an optional device index for the returned holder; it defaults
+// to std::nullopt when the caller does not supply one.
+nvfuser::KernelArgumentHolder from_pyiterable(
+    const py::iterable& iter,
+    std::optional<int64_t> device = std::nullopt);
+
+// Convert a KernelArgumentHolder to a std::vector<at::Tensor>, one tensor per
+// entry and in the same order.
+// NOTE(review): presumably every entry must hold an at::Tensor - confirm how
+// non-tensor entries are handled.
+std::vector<at::Tensor> to_tensor_vector(
+    const nvfuser::KernelArgumentHolder& outputs);
+
+} // namespace nvfuser::python
diff --git a/python/python_direct/runtime.cpp b/python/python_direct/runtime.cpp
diff --git a/tests/python_direct/test_python_direct.py b/tests/python_direct/test_python_direct.py

Original file line number	Diff line number	Diff line change
`@@ -572,6 +572,7 @@ if(BUILD_PYTHON)`
`572`	`572`	`${NVFUSER_PYTHON_DIRECT_BINDINGS}/ir.cpp`
`573`	`573`	`${NVFUSER_PYTHON_DIRECT_BINDINGS}/ops.cpp`
`574`	`574`	`${NVFUSER_PYTHON_DIRECT_BINDINGS}/runtime.cpp`
	`575`	`+ ${NVFUSER_PYTHON_DIRECT_BINDINGS}/direct_utils.cpp`
`575`	`576`	`)`
`576`	`577`	`add_library(nvf_py_direct_internal OBJECT ${NVFUSER_PYTHON_DIRECT_SRCS})`
`577`	`578`