Skip to content

Commit c68b719

Browse files
committed
CUDA backend
1 parent 5580b47 commit c68b719

37 files changed

+4045
-5
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
3434
option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
3535
option(MLX_BUILD_METAL "Build metal backend" ON)
3636
option(MLX_BUILD_CPU "Build cpu backend" ON)
37+
option(MLX_BUILD_CUDA "Build cuda backend" OFF)
3738
option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
3839
option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
3940
option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
@@ -83,6 +84,10 @@ if(MLX_BUILD_METAL)
8384
set(QUARTZ_LIB "-framework QuartzCore")
8485
endif()
8586

87+
if(MLX_BUILD_CUDA)
88+
enable_language(CUDA)
89+
endif()
90+
8691
if(MLX_BUILD_METAL AND NOT METAL_LIB)
8792
message(STATUS "Metal not found. Unable to build GPU")
8893
set(MLX_BUILD_METAL OFF)

mlx/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ target_sources(
55
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
66
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
77
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
8+
${CMAKE_CURRENT_SOURCE_DIR}/dtype_utils.cpp
89
${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
910
${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
1011
${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
@@ -47,6 +48,8 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
4748

4849
if(MLX_BUILD_METAL)
4950
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
51+
elseif(MLX_BUILD_CUDA)
52+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
5053
else()
5154
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_metal)
5255
endif()

mlx/backend/cuda/CMakeLists.txt

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Filename rules in CUDA backend:
#
# * Use .cu/.cuh if code contains device code, and .cpp/.h if not.
# * Device-only kernel code should be put in kernels/ subdir.
# * Files in kernels/ subdir should not include files outside.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)

target_compile_definitions(mlx PUBLIC MLX_USE_CUDA)

# Enable defining device lambda functions.
target_compile_options(mlx
                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")

set_target_properties(
  mlx
  PROPERTIES CUDA_STANDARD 17
             CUDA_SEPARABLE_COMPILATION ON
             # Compute capability 7 is required for synchronization between
             # CPU/GPU with managed memory.
             # TODO: Add more architectures for release build.
             CUDA_ARCHITECTURES "70;80")

# Use fixed version of CCCL. Make sure the FetchContent module is loaded even
# if this directory is configured before the top-level file includes it.
include(FetchContent)
FetchContent_Declare(
  cccl
  URL "https://github.com/NVIDIA/cccl/releases/download/v2.8.1/cccl-v2.8.1.zip")
FetchContent_MakeAvailable(cccl)
# NOTE: BEFORE must come before the scope keyword (PRIVATE); writing
# "PRIVATE BEFORE" makes CMake treat BEFORE as an include directory.
target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")

# Make CUDA APIs visible in .cpp files.
find_package(CUDAToolkit REQUIRED)
target_include_directories(mlx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})

# Suppress nvcc warnings on MLX headers.
target_compile_options(mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe
                                   --diag_suppress=997>)

mlx/backend/cuda/allocator.cpp

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright © 2025 Apple Inc.
2+
3+
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/utils.h"

#include <algorithm>
#include <stdexcept>

#include <cuda_runtime.h>
#include <fmt/format.h>
8+
9+
namespace mlx::core {
10+
11+
namespace mxcuda {
12+
13+
// Initialize the soft memory limit to 80% of the device's total memory,
// as reported by cudaMemGetInfo at construction time.
CudaAllocator::CudaAllocator() {
  size_t free_mem = 0;
  size_t total_mem = 0;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
  memory_limit_ = total_mem * 0.8;
}
18+
19+
// Allocate |size| bytes of cuda-managed memory and return a Buffer wrapping
// the bookkeeping struct. On cudaErrorMemoryAllocation the returned buffer's
// data pointer is null (callers see it via Buffer::raw_ptr); any other CUDA
// error throws std::runtime_error.
Buffer CudaAllocator::malloc(size_t size) {
  // TODO: Check memory limit.
  auto* buf = new CudaBuffer{nullptr, size};
  cudaError_t err = cudaMallocManaged(&buf->data, size);
  if (err != cudaSuccess && err != cudaErrorMemoryAllocation) {
    // Release the bookkeeping struct before throwing so it is not leaked.
    delete buf;
    throw std::runtime_error(
        fmt::format("cudaMallocManaged failed: {}", cudaGetErrorString(err)));
  }
  active_memory_ += size;
  peak_memory_ = std::max(active_memory_, peak_memory_);
  return Buffer{buf};
}
31+
32+
// Release a buffer previously returned by malloc(). A buffer holding a null
// pointer is silently ignored.
void CudaAllocator::free(Buffer buffer) {
  if (auto* cuda_buf = static_cast<CudaBuffer*>(buffer.ptr())) {
    active_memory_ -= cuda_buf->size;
    cudaFree(cuda_buf->data);
    delete cuda_buf;
  }
}
41+
42+
// Return the byte size recorded for |buffer|, or 0 for a null buffer.
size_t CudaAllocator::size(Buffer buffer) const {
  auto* buf = static_cast<CudaBuffer*>(buffer.ptr());
  if (!buf) {
    return 0;
  }
  // Reuse the already-cast pointer instead of casting buffer.ptr() a
  // second time.
  return buf->size;
}
49+
50+
// Return the process-wide CUDA allocator. The instance is intentionally
// heap-allocated and never deleted, so its destructor does not run at
// process exit; this saves time on shutdown and lets the OS reclaim any
// outstanding allocations.
CudaAllocator& allocator() {
  static CudaAllocator* singleton = new CudaAllocator;
  return *singleton;
}
57+
58+
} // namespace mxcuda
59+
60+
namespace allocator {
61+
62+
Allocator& allocator() {
63+
return mxcuda::allocator();
64+
}
65+
66+
void* Buffer::raw_ptr() {
67+
if (!ptr_) {
68+
return nullptr;
69+
}
70+
return static_cast<mxcuda::CudaBuffer*>(ptr_)->data;
71+
}
72+
73+
} // namespace allocator
74+
75+
size_t get_active_memory() {
76+
return mxcuda::allocator().get_active_memory();
77+
}
78+
size_t get_peak_memory() {
79+
return mxcuda::allocator().get_peak_memory();
80+
}
81+
void reset_peak_memory() {
82+
return mxcuda::allocator().reset_peak_memory();
83+
}
84+
size_t set_memory_limit(size_t limit) {
85+
return mxcuda::allocator().set_memory_limit(limit);
86+
}
87+
size_t get_memory_limit() {
88+
return mxcuda::allocator().get_memory_limit();
89+
}
90+
91+
// No-ops for common allocator
92+
size_t get_cache_memory() {
93+
return 0;
94+
}
95+
size_t set_cache_limit(size_t) {
96+
return 0;
97+
}
98+
size_t set_wired_limit(size_t) {
99+
return 0;
100+
}
101+
void clear_cache() {}
102+
103+
} // namespace mlx::core

mlx/backend/cuda/allocator.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright © 2025 Apple Inc.
2+
3+
#pragma once
4+
5+
#include "mlx/allocator.h"
6+
7+
#include <utility>
8+
9+
namespace mlx::core::mxcuda {
10+
11+
using allocator::Buffer;
12+
13+
// Stores cuda-managed memory.
struct CudaBuffer {
  void* data; // Pointer from cudaMallocManaged; may be null if allocation
              // failed with cudaErrorMemoryAllocation.
  size_t size; // Allocation size in bytes as requested from malloc().
};
18+
19+
class CudaAllocator : public allocator::Allocator {
20+
public:
21+
Buffer malloc(size_t size) override;
22+
void free(Buffer buffer) override;
23+
size_t size(Buffer buffer) const override;
24+
25+
size_t get_active_memory() const {
26+
return active_memory_;
27+
};
28+
size_t get_peak_memory() const {
29+
return peak_memory_;
30+
};
31+
void reset_peak_memory() {
32+
peak_memory_ = 0;
33+
};
34+
size_t get_memory_limit() {
35+
return memory_limit_;
36+
}
37+
size_t set_memory_limit(size_t limit) {
38+
std::swap(memory_limit_, limit);
39+
return limit;
40+
}
41+
42+
private:
43+
CudaAllocator();
44+
friend CudaAllocator& allocator();
45+
46+
size_t memory_limit_;
47+
size_t active_memory_{0};
48+
size_t peak_memory_{0};
49+
};
50+
51+
CudaAllocator& allocator();
52+
53+
} // namespace mlx::core::mxcuda

0 commit comments

Comments
 (0)