diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index aa11773defdb1..2ad39405cc06f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -1,25 +1,11 @@
 // RUN: mlir-opt %s \
-// RUN:  -convert-nvgpu-to-nvvm \
-// RUN:  -gpu-kernel-outlining \
-// RUN:  -convert-vector-to-scf \
-// RUN:  -convert-scf-to-cf \
-// RUN:  -convert-nvvm-to-llvm \
-// RUN:  -convert-vector-to-llvm \
-// RUN:  -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:  -convert-arith-to-llvm \
-// RUN:  -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:  -convert-func-to-llvm \
-// RUN:  -canonicalize -cse \
-// RUN:  -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN:  | mlir-cpu-runner \
 // RUN:    --shared-libs=%mlir_cuda_runtime \
 // RUN:    --shared-libs=%mlir_runner_utils \
 // RUN:    --entry-point-result=void \
 // RUN:  | FileCheck %s
-
 
 // Test swizzling with TMA load
 // 128B Swizzle Each numbered cell is 16 byte
 // |-------------------------------|
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 5c465f7de8abd..242c5ff875cf4 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -1,19 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:  -convert-nvgpu-to-nvvm \
-// RUN:  -canonicalize -cse \
-// RUN:  -gpu-kernel-outlining \
-// RUN:  -convert-vector-to-scf \
-// RUN:  -convert-scf-to-cf \
-// RUN:  -convert-nvvm-to-llvm \
-// RUN:  -convert-vector-to-llvm \
-// RUN:  -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:  -convert-arith-to-llvm \
-// RUN:  -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:  -convert-func-to-llvm \
-// RUN:  -canonicalize -cse \
-// RUN:  -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN:  | mlir-cpu-runner \
 // RUN:    --shared-libs=%mlir_cuda_runtime \
 // RUN:    --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 5331ebb87d37d..44b127bd409ba 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -1,16 +1,10 @@
-// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
-// RUN:  -gpu-kernel-outlining \
-// RUN:  -convert-nvvm-to-llvm \
-// RUN:  -convert-scf-to-cf \
-// RUN:  -convert-vector-to-llvm \
-// RUN:  -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:  -convert-arith-to-llvm \
-// RUN:  -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:  -convert-func-to-llvm \
-// RUN:  -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
-// RUN:  2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
+// RUN: mlir-opt %s \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
+// RUN:  | mlir-cpu-runner \
+// RUN:    --shared-libs=%mlir_cuda_runtime \
+// RUN:    --shared-libs=%mlir_runner_utils \
+// RUN:    --entry-point-result=void \
+// RUN:  | FileCheck %s
 
 // Basic PTX check to make sure we are generating the right instructions.
 
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 174d27b0da8a1..b5af22f23a77c 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
@@ -143,11 +144,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
 
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
 
   // Convert vector to LLVM (always needed).
@@ -157,6 +153,9 @@
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  // Convert NVVM ops to LLVM inline assembly; this pass is needed for PTX
+  // building.
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+
   // Sprinkle some cleanups.
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
@@ -167,6 +166,20 @@
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
+  // Start with a cleanup pass.
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+
+  //===----------------------------------------------------------------------===//
+  // NVGPU lowering emits host-side driver calls as well as device code, so it
+  // must run before kernel outlining.
+  //===----------------------------------------------------------------------===//
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
   //===----------------------------------------------------------------------===//
   // Host-specific stuff.
   //===----------------------------------------------------------------------===//
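
For reference, the -test-lower-to-nvvm option used in the updated RUN lines is a registered pass pipeline, not a single pass. The following is a minimal, hypothetical sketch of that registration, assuming MLIR's standard PassPipelineRegistration mechanism; the option names mirror the RUN lines above, while the trimmed option list and the default values shown here are illustrative rather than copied from TestLowerToNVVM.cpp:

    #include "mlir/Pass/PassManager.h"
    #include "mlir/Pass/PassOptions.h"
    #include "mlir/Pass/PassRegistry.h"

    #include <string>

    using namespace mlir;

    // Hypothetical, trimmed-down mirror of TestLowerToNVVMOptions; the real
    // struct declares more options (index bitwidths, cubin format, ...).
    struct TestLowerToNVVMOptions
        : public PassPipelineOptions<TestLowerToNVVMOptions> {
      Option<std::string> cubinChip{*this, "cubin-chip",
                                    llvm::cl::desc("Chip to target."),
                                    llvm::cl::init("sm_50")};
      Option<std::string> cubinFeatures{*this, "cubin-features",
                                        llvm::cl::desc("Features to target."),
                                        llvm::cl::init("+ptx60")};
      Option<int> optLevel{*this, "opt-level",
                           llvm::cl::desc("Optimization level."),
                           llvm::cl::init(3)};
    };

    // Defined in the patch above; builds the full host + device pipeline.
    void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                      const TestLowerToNVVMOptions &options);

    // Registering the builder under "test-lower-to-nvvm" is what lets the
    // RUN lines replace a dozen hand-written pass invocations with one flag.
    void registerTestLowerToNVVM() {
      PassPipelineRegistration<TestLowerToNVVMOptions>(
          "test-lower-to-nvvm",
          "Lower GPU and host IR to NVVM/LLVM and serialize kernels to cubin.",
          buildLowerToNVVMPassPipeline);
    }

With this in place, mlir-opt parses the quoted option string ("cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3") into TestLowerToNVVMOptions and hands it to buildLowerToNVVMPassPipeline, which is why the three tests can share one short invocation instead of repeating the whole lowering sequence.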