
Commit 06ec46f

kasper0406 authored and Google-ML-Automation committed
PR #24114: Triton/Nvidia: Fix fused fp8 <-> fp8 conversions
Imported from GitHub PR #24114

Converting FP8 <-> FP8 fails because the Triton compiler does not support such conversions directly. The proposed fix makes the conversion go through FP16.

Two questions:

1) Are there better approaches to solving this?
2) I could not find a place to put unit tests for this, and in the code there is a comment saying:

```
// TODO(b/266862493): Add end-to-end test once FP8 support lands in XLA as
// we can't test the code below without patching the feature.
```

Is there a place where I can add a test?

### Details

When converting FP8 types, the XLA compiler emits a Triton `fp_to_fp` instruction. If the source type is FP8, no rounding strategy is specified. Concretely, this causes the following Triton IR to be emitted:

<details>
<summary><code>%24 = tt.fp_to_fp %20 : tensor<32x64xf8E5M2> -> tensor<32x64xf8E4M3FN></code></summary>

```
module {
  tt.func @gemm_fusion_dot_320_impl(%arg0: !tt.ptr<f8E4M3FN> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f8E5M2> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f8E4M3FN> {tt.divisibility = 16 : i32}) {
    %cst = arith.constant dense<0.000000e+00> : tensor<64x64xf8E4M3FN>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x64xf8E4M3FN>
    %c90_i32 = arith.constant 90 : i32
    %c32000_i64 = arith.constant 32000 : i64
    %c64_i32 = arith.constant 64 : i32
    %c90_i64 = arith.constant 90 : i64
    %c768_i64 = arith.constant 768 : i64
    %c0_i32 = arith.constant 0 : i32
    %c1_i64 = arith.constant 1 : i64
    %c32_i32 = arith.constant 32 : i32
    %c24_i32 = arith.constant 24 : i32
    %c8_i32 = arith.constant 8 : i32
    %c4000_i32 = arith.constant 4000 : i32
    %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x64xf32>
    %0 = tt.get_program_id x : i32
    %1 = arith.divsi %0, %c4000_i32 : i32
    %2 = arith.muli %1, %c8_i32 : i32
    %3 = arith.subi %c24_i32, %2 : i32
    %4 = arith.cmpi slt, %3, %c8_i32 : i32
    %5 = arith.select %4, %3, %c8_i32 : i32
    %6 = arith.remsi %0, %5 : i32
    %7 = arith.addi %2, %6 : i32
    %8 = arith.remsi %0, %c4000_i32 : i32
    %9 = arith.divsi %8, %5 : i32
    %10 = arith.muli %7, %c32_i32 : i32
    %11 = tt.make_tensor_ptr %arg1, [%c768_i64, %c90_i64], [%c1_i64, %c768_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf8E5M2>>
    %12 = tt.advance %11, [%10, %c0_i32] : <tensor<32x64xf8E5M2>>
    %13 = arith.muli %9, %c64_i32 : i32
    %14 = tt.make_tensor_ptr %arg0, [%c90_i64, %c32000_i64], [%c1_i64, %c90_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x64xf8E4M3FN>>
    %15 = tt.advance %14, [%c0_i32, %13] : <tensor<64x64xf8E4M3FN>>
    %16:3 = scf.for %arg3 = %c0_i32 to %c90_i32 step %c64_i32 iter_args(%arg4 = %12, %arg5 = %15, %arg6 = %cst_1) -> (!tt.ptr<tensor<32x64xf8E5M2>>, !tt.ptr<tensor<64x64xf8E4M3FN>>, tensor<32x64xf32>) : i32 {
      %20 = tt.load %arg4 {boundaryCheck = array<i32: 1>, padding = 1 : i32} : !tt.ptr<tensor<32x64xf8E5M2>>
      %21 = tt.advance %arg4, [%c0_i32, %c64_i32] : <tensor<32x64xf8E5M2>>
      %22 = tt.load %arg5 {boundaryCheck = array<i32: 0>, padding = 1 : i32} : !tt.ptr<tensor<64x64xf8E4M3FN>>
      %23 = tt.advance %arg5, [%c64_i32, %c0_i32] : <tensor<64x64xf8E4M3FN>>
      %24 = tt.fp_to_fp %20 : tensor<32x64xf8E5M2> -> tensor<32x64xf8E4M3FN>
      %25 = arith.subi %c90_i32, %arg3 : i32
      %26 = arith.cmpi slt, %25, %c64_i32 : i32
      %27 = scf.if %26 -> (tensor<32x64xf8E4M3FN>) {
        %30 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
        %31 = tt.expand_dims %30 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
        %32 = tt.splat %25 : i32 -> tensor<1x64xi32>
        %33 = arith.cmpi slt, %31, %32 : tensor<1x64xi32>
        %34 = tt.broadcast %33 : tensor<1x64xi1> -> tensor<32x64xi1>
        %35 = arith.select %34, %24, %cst_0 : tensor<32x64xi1>, tensor<32x64xf8E4M3FN>
        scf.yield %35 : tensor<32x64xf8E4M3FN>
      } else {
        scf.yield %24 : tensor<32x64xf8E4M3FN>
      }
      %28 = scf.if %26 -> (tensor<64x64xf8E4M3FN>) {
        %30 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
        %31 = tt.expand_dims %30 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
        %32 = tt.splat %25 : i32 -> tensor<64x1xi32>
        %33 = arith.cmpi slt, %31, %32 : tensor<64x1xi32>
        %34 = tt.broadcast %33 : tensor<64x1xi1> -> tensor<64x64xi1>
        %35 = arith.select %34, %22, %cst : tensor<64x64xi1>, tensor<64x64xf8E4M3FN>
        scf.yield %35 : tensor<64x64xf8E4M3FN>
      } else {
        scf.yield %22 : tensor<64x64xf8E4M3FN>
      }
      %29 = tt.dot %27, %28, %arg6, inputPrecision = tf32 {maxNumImpreciseAcc = 2147483647 : i32} : tensor<32x64xf8E4M3FN> * tensor<64x64xf8E4M3FN> -> tensor<32x64xf32>
      scf.yield %21, %23, %29 : !tt.ptr<tensor<32x64xf8E5M2>>, !tt.ptr<tensor<64x64xf8E4M3FN>>, tensor<32x64xf32>
    }
    %17 = tt.fp_to_fp %16#2, rounding = rtne : tensor<32x64xf32> -> tensor<32x64xf8E4M3FN>
    %18 = tt.make_tensor_ptr %arg2, [%c768_i64, %c32000_i64], [%c1_i64, %c768_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf8E4M3FN>>
    %19 = tt.advance %18, [%10, %13] : <tensor<32x64xf8E4M3FN>>
    tt.store %19, %17 : !tt.ptr<tensor<32x64xf8E4M3FN>>
    tt.return
  }
}
```
</details>

Which leads to a failing assertion:

```
#0 0x000073413786d9fc in pthread_kill () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x0000734137819476 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007341377ff7f3 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3 0x00007341377ff71b in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4 0x0000734137810e96 in __assert_fail () from /lib/x86_64-linux-gnu/libc.so.6
#5 0x000057d936b1777b in mlir::triton::gpu::(anonymous namespace)::FpToFpOpConversion::createDestOps (this=0x733d08425cc0, op=..., adaptor=..., rewriter=..., elemTy=..., operands=..., loc=...) at external/triton/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp:500
#6 0x000057d936b17195 in mlir::triton::gpu::ElementwiseOpConversionBase<mlir::triton::FpToFpOp, mlir::triton::gpu::(anonymous namespace)::FpToFpOpConversion>::matchAndRewrite (this=0x733d08425cc0, op=..., adaptor=..., rewriter=...) at external/triton/include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h:188
[...]
#29 0x000057d93fa6cade in mlir::PassManager::run (this=0x733e80fba158, op=0x733d080bbc20) at external/llvm-project/mlir/lib/Pass/Pass.cpp:885
#30 0x000057d9363f6b1b in xla::gpu::CompileTritonToLLVM (hlo_config=..., hlo_module_name="gemm_fusion_dot.320", device_info=..., block_level_parameters=..., triton_module=..., llvm_module=0x733d0816d6a0, mlir_context=..., is_xla_fusion=true, emit_kernel=true) at xla/backends/gpu/codegen/triton/fusion_emitter.cc:1627
#31 0x000057d9363f5a5d in xla::gpu::TritonWrapper (fn_name="gemm_fusion_dot_320_impl", fusion=0x733d080a31c0, cc=std::variant<stream_executor::CudaComputeCapability, stream_executor::RocmComputeCapability> [index 0] = {...}, device_info=..., block_level_parameters=..., llvm_module=0x733d0816d6a0, mlir_context=...) at xla/backends/gpu/codegen/triton/fusion_emitter.cc:1531
```

Triton compilation fails for two reasons:

* First, it hits an assertion that a rounding strategy must be specified when the destination type is FP8.
* After adding a rounding strategy, it hits a second issue: Triton has no conversion method for FP8 <-> FP8.

To work around these two issues, I propose going through FP16 when both the source and destination types are FP8.

Copybara import of the project:

-- afd3929 by Kasper Nielsen <[email protected]>:
Fix fused fp8 <-> fp8 conversions

-- 66340aa by Kasper Nielsen <[email protected]>:
Add unit tests and refactor duplicated code

-- 07ae307 by Kasper Nielsen <[email protected]>:
Run clang-format

Merging this change closes #24114

FUTURE_COPYBARA_INTEGRATE_REVIEW=#24114 from kasper0406:kn/fp8-conversion-fix 07ae307
PiperOrigin-RevId: 741162069
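For readers skimming the diffs below, the core of the workaround is the new FP8 <-> FP8 branch added to the shared `Cast()` helper in `xla/backends/gpu/codegen/triton/emitter_helpers.cc`; a condensed excerpt of that branch:

```cpp
// When both the source and the destination element types are FP8, Triton has
// no direct conversion, so widen to f16 first (no rounding mode is needed when
// extending) and then round to the FP8 destination type with RTNE.
if (IsFp8Type(src_element_ty) && IsFp8Type(dst_element_ty)) {
  auto fp16_value = b.create<mt::FpToFpOp>(fp16_ty, value);
  return b.create<mt::FpToFpOp>(
      dst_ty, fp16_value,
      mt::RoundingModeAttr::get(b.getContext(), mt::RoundingMode::RTNE));
}
```

The net effect is that a single unsupported `tt.fp_to_fp` (FP8 -> FP8) is replaced by two supported ones: FP8 -> f16, then f16 -> FP8 with RTNE rounding.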
1 parent 6a48e7d commit 06ec46f

4 files changed: +83 -125 lines changed

xla/backends/gpu/codegen/triton/emitter_helpers.cc

Lines changed: 11 additions & 2 deletions
```diff
@@ -126,11 +126,13 @@ bool IsFp8Type(Type t) {
 Value Cast(EmitterLocOpBuilder& b, Value value, Type dst_element_ty) {
   Type src_ty = value.getType();
   Type src_element_ty = src_ty;
+  Type fp16_ty = b.getF16Type();
   Type fp32_ty = b.getF32Type();
   Type dst_ty = dst_element_ty;
   if (auto src_shaped_ty = mlir::dyn_cast<ShapedType>(src_ty)) {
     src_element_ty = src_shaped_ty.getElementType();
     dst_ty = src_shaped_ty.clone(src_shaped_ty.getShape(), dst_element_ty);
+    fp16_ty = src_shaped_ty.clone(src_shaped_ty.getShape(), b.getF16Type());
     fp32_ty = src_shaped_ty.clone(src_shaped_ty.getShape(), b.getF32Type());
   }
   if (src_ty == dst_ty) {
@@ -156,14 +158,21 @@ Value Cast(EmitterLocOpBuilder& b, Value value, Type dst_element_ty) {
     // because LLVM doesn't support casts from/to FP8.
     // TODO(b/266862493): Add end-to-end test once FP8 support lands in XLA as
     // we can't test the code below without patching the feature.
-    if (IsFp8Type(src_element_ty)) {
+    if (IsFp8Type(src_element_ty) && !IsFp8Type(dst_element_ty)) {
       return b.create<mt::FpToFpOp>(dst_ty, value);
     }
-    if (IsFp8Type(dst_element_ty)) {
+    if (IsFp8Type(dst_element_ty) && !IsFp8Type(src_element_ty)) {
       return b.create<mt::FpToFpOp>(
           dst_ty, value,
           mt::RoundingModeAttr::get(b.getContext(), mt::RoundingMode::RTNE));
     }
+    if (IsFp8Type(src_element_ty) && IsFp8Type(dst_element_ty)) {
+      // FP8 <-> FP8 conversion needs to go through FP16
+      auto fp16_value = b.create<mt::FpToFpOp>(fp16_ty, value);
+      return b.create<mt::FpToFpOp>(
+          dst_ty, fp16_value,
+          mt::RoundingModeAttr::get(b.getContext(), mt::RoundingMode::RTNE));
+    }
 
     if (src_fp_element_ty.getFPMantissaWidth() >
         dst_fp_element_ty.getFPMantissaWidth()) {
```
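For illustration, a hypothetical call site of the shared helper (a sketch, not part of this commit): `b` is assumed to be an `EmitterLocOpBuilder`, `value` an `f8e5m2` tensor value, and `TritonType` is used the same way as in `EmitElementwise` in the matmul emitter below.

```cpp
// Hypothetical usage sketch: convert an f8e5m2 value to f8e4m3fn through the
// shared Cast() helper. With this fix, the helper emits two tt.fp_to_fp ops:
// f8E5M2 -> f16, then f16 -> f8E4M3FN with RTNE rounding.
TF_ASSIGN_OR_RETURN(Type dst_ty, TritonType(b, F8E4M3FN));
Value converted = triton::Cast(b, value, dst_ty);
```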

xla/backends/gpu/codegen/triton/fusion_emitter_device_legacy_test.cc

Lines changed: 30 additions & 0 deletions
```diff
@@ -4202,6 +4202,36 @@ ENTRY main {
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{/*aabs=*/1.0, /*arel=*/1e-3}));
 }
 
+TEST_F(TritonTest, FP8ToFP8EndToEnd) {
+  if (!GetCudaComputeCapability().IsAtLeastHopper()) {
+    GTEST_SKIP() << "Doesn't pass on pre-Hopper GPUs.";
+  }
+
+  const std::string hlo_text = R"(
+HloModule t
+
+triton_dot {
+  parameter_0 = f8e5m2[32,32]{1,0} parameter(0)
+  parameter_1 = f8e4m3fn[32,32]{1,0} parameter(1)
+  convert = f8e4m3fn[32,32]{1,0} convert(parameter_0)
+  ROOT dot = f32[32,32]{1,0} dot(convert, parameter_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY main {
+  parameter_0 = f8e5m2[32,32]{1,0} parameter(0)
+  parameter_1 = f8e4m3fn[32,32]{1,0} parameter(1)
+  ROOT gemm_fusion_dot = f32[32,32]{1,0} fusion(parameter_0, parameter_1),
+    kind=kCustom, calls=triton_dot,
+    backend_config={
+      "fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":
+        {"block_m":"32","block_n":"32","block_k":"32","split_k":"1",
+        "num_stages":"1","num_warps":"4","num_ctas":"1"}}}
+})";
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{/*aabs=*/1.0, /*arel=*/1e-3}));
+}
+
 // Test PreventMmaV3LoopUnrolling pass in order to keep compile time low.
 // See b/344841434.
 TEST_F(TritonGemmTest, TestPreventMMAV3LoopUnrolling) {
```

xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc

Lines changed: 34 additions & 0 deletions
```diff
@@ -1806,6 +1806,40 @@ ENTRY entry_computation {
   EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), kExactMatch));
 }
 
+TEST_F(TritonEmitterTest, FP8ToFP8EndToEnd) {
+  if (auto cc =
+          std::get_if<se::CudaComputeCapability>(&GpuComputeCapability())) {
+    if (!cc->IsAtLeastHopper()) {
+      GTEST_SKIP() << "Doesn't pass on pre-Hopper GPUs.";
+    }
+  }
+
+  const std::string hlo_text = R"(
+HloModule t
+
+triton_dot {
+  parameter_0 = f8e5m2[32,32]{1,0} parameter(0)
+  parameter_1 = f8e4m3fn[32,32]{1,0} parameter(1)
+  convert = f8e4m3fn[32,32]{1,0} convert(parameter_0)
+  ROOT dot = f32[32,32]{1,0} dot(convert, parameter_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY main {
+  parameter_0 = f8e5m2[32,32]{1,0} parameter(0)
+  parameter_1 = f8e4m3fn[32,32]{1,0} parameter(1)
+  ROOT gemm_fusion_dot = f32[32,32]{1,0} fusion(parameter_0, parameter_1),
+    kind=kCustom, calls=triton_dot,
+    backend_config={
+      "fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":
+        {"block_m":"32","block_n":"32","block_k":"32","split_k":"1",
+        "num_stages":"1","num_warps":"4","num_ctas":"1"}}}
+})";
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text,
+                                       ErrorSpec{/*aabs=*/1.0, /*arel=*/1e-3}));
+}
+
 TEST_F(TritonEmitterTest, SingleTileDotWithNestedFusionsIsEmittedCorrectly) {
   // Simplest case when everything fits into one tile that is useful for
   // debugging. This also tests support for empty nested fusions.
```

xla/backends/gpu/codegen/triton/fusion_emitter_legacy_matmul.cc

Lines changed: 8 additions & 123 deletions
```diff
@@ -227,121 +227,6 @@ bool IsFp8Type(Type t) {
                    mlir::Float8E4M3B11FNUZType>(t);
 }
 
-Value Cast(EmitterLocOpBuilder b, Value value, Type dst_element_ty) {
-  Type src_ty = value.getType();
-  Type src_element_ty = src_ty;
-  Type fp32_ty = b.getF32Type();
-  Type dst_ty = dst_element_ty;
-  if (auto src_shaped_ty = mlir::dyn_cast<ShapedType>(src_ty)) {
-    src_element_ty = src_shaped_ty.getElementType();
-    dst_ty = src_shaped_ty.clone(src_shaped_ty.getShape(), dst_element_ty);
-    fp32_ty = src_shaped_ty.clone(src_shaped_ty.getShape(), b.getF32Type());
-  }
-  if (src_ty == dst_ty) {
-    return value;
-  }
-
-  // All operations on bf16 are done through f32.
-  if (src_element_ty.isBF16()) {
-    return Cast(b, b.create<ma::ExtFOp>(fp32_ty, value), dst_element_ty);
-  }
-  if (dst_element_ty.isBF16()) {
-    // S8 -> BF16 is directly supported and doesn't need to go through f32.
-    if (!src_element_ty.isInteger(8)) {
-      return b.create<ma::TruncFOp>(dst_ty, Cast(b, value, b.getF32Type()));
-    }
-  }
-
-  // float => float
-  auto src_fp_element_ty = mlir::dyn_cast<mlir::FloatType>(src_element_ty);
-  auto dst_fp_element_ty = mlir::dyn_cast<mlir::FloatType>(dst_element_ty);
-  if (src_fp_element_ty && dst_fp_element_ty) {
-    // F8 <-> FP16, BF16, FP32, FP64 need to be handled via Triton's tt.fp_to_fp
-    // because LLVM doesn't support casts from/to FP8.
-    // TODO(b/266862493): Add end-to-end test once FP8 support lands in XLA as
-    // we can't test the code below without patching the feature.
-    if (IsFp8Type(src_element_ty)) {
-      return b.create<mt::FpToFpOp>(dst_ty, value);
-    }
-    if (IsFp8Type(dst_element_ty)) {
-      return b.create<mt::FpToFpOp>(
-          dst_ty, value,
-          mt::RoundingModeAttr::get(b.getContext(), mt::RoundingMode::RTNE));
-    }
-
-    if (src_fp_element_ty.getFPMantissaWidth() >
-        dst_fp_element_ty.getFPMantissaWidth()) {
-      return b.create<ma::TruncFOp>(dst_ty, value);
-    } else {
-      return b.create<ma::ExtFOp>(dst_ty, value);
-    }
-  }
-  // int => int
-  if (mlir::isa<mlir::IntegerType>(src_element_ty) &&
-      mlir::isa<mlir::IntegerType>(dst_element_ty)) {
-    if (src_element_ty.getIntOrFloatBitWidth() <
-        dst_element_ty.getIntOrFloatBitWidth()) {
-      if (src_element_ty.isInteger(1)) {
-        return b.create<ma::ExtUIOp>(dst_ty, value);
-      }
-      return b.create<ma::ExtSIOp>(dst_ty, value);
-    }
-    return b.create<ma::TruncIOp>(dst_ty, value);
-  }
-  // int => float
-  if (mlir::isa<mlir::IntegerType>(src_element_ty) && dst_fp_element_ty) {
-    // TODO(b/266862493): Support unsigned integer types.
-    if (src_element_ty.isInteger(1)) {
-      return b.create<ma::UIToFPOp>(dst_ty, value);
-    }
-    return b.create<ma::SIToFPOp>(dst_ty, value);
-  }
-  // float => int
-  if (src_fp_element_ty && mlir::isa<mlir::IntegerType>(dst_element_ty)) {
-    if (dst_element_ty.isInteger(1)) {
-      return b.create<ma::CmpFOp>(ma::CmpFPredicate::UNE, value,
-                                  ZerosLike(b, value));
-    }
-    // TODO(b/266862493): Support unsigned integer types.
-    // The current logic handles signed integer types only. Additional handling
-    // is needed for unsigned integer types.
-    auto cst_int = [&](EmitterLocOpBuilder b, int64_t x) {
-      if (auto src_shaped_ty = mlir::dyn_cast<ShapedType>(src_ty)) {
-        return CreateConst(b, dst_element_ty, x, src_shaped_ty.getShape());
-      } else {
-        return CreateConst(b, dst_element_ty, x);
-      }
-    };
-    auto cst_float = [&](EmitterLocOpBuilder b, int64_t x) {
-      if (auto src_shaped_ty = mlir::dyn_cast<ShapedType>(src_ty)) {
-        return CreateConst(b, src_fp_element_ty, x, src_shaped_ty.getShape());
-      } else {
-        return CreateConst(b, src_fp_element_ty, x);
-      }
-    };
-    auto fptosi = b.create<ma::FPToSIOp>(dst_ty, value);
-    int64_t min = llvm::minIntN(dst_element_ty.getIntOrFloatBitWidth());
-    int64_t max = llvm::maxIntN(dst_element_ty.getIntOrFloatBitWidth());
-
-    // value <= static_cast<float>(INT_MIN) ? INT_MIN : ...
-    auto clamped = b.create<ma::SelectOp>(
-        b.create<ma::CmpFOp>(ma::CmpFPredicate::OLE, value, cst_float(b, min)),
-        cst_int(b, min), fptosi);
-    // value >= static_cast<float>(INT_MAX) ? INT_MAX : ...
-    clamped = b.create<ma::SelectOp>(
-        b.create<ma::CmpFOp>(ma::CmpFPredicate::OGE, value, cst_float(b, max)),
-        cst_int(b, max), clamped);
-    // isnan(value) ? 0 : ...
-    return b.create<ma::SelectOp>(
-        b.create<ma::CmpFOp>(ma::CmpFPredicate::UNO, value, value),
-        cst_int(b, 0), clamped);
-  }
-
-  LOG(FATAL) << "Type conversion not supported: "
-             << llvm_ir::DumpToString(src_element_ty) << " -> "
-             << llvm_ir::DumpToString(dst_element_ty);
-}
-
 Value Subtract(EmitterLocOpBuilder b, ValueRange values) {
   if (mlir::isa<mlir::IntegerType>(mlir::getElementTypeOrSelf(values[0]))) {
     return b.create<ma::SubIOp>(values[0], values[1]);
@@ -448,7 +333,7 @@ absl::StatusOr<Value> EmitElementwise(EmitterLocOpBuilder b,
     case HloOpcode::kConvert: {
       TF_ASSIGN_OR_RETURN(Type dst_ty,
                           TritonType(b, hlo.shape().element_type()));
-      return Cast(b, inputs[0], dst_ty);
+      return triton::Cast(b, inputs[0], dst_ty);
     }
     case HloOpcode::kAdd:
      if (is_integer) {
@@ -661,7 +546,7 @@ absl::StatusOr<Value> EmitScope(
     if (hlo->opcode() == HloOpcode::kConvert &&
         hlo->operand(0)->shape().element_type() == S4) {
       Value unpacked;
-      unpacked = Cast(b, values[hlo->operand(0)], b.getI8Type());
+      unpacked = triton::Cast(b, values[hlo->operand(0)], b.getI8Type());
       std::vector<Value> operands({unpacked});
       TF_ASSIGN_OR_RETURN(result, EmitElementwise(b, libdevice_path,
                                                   device_info, *hlo, operands));
@@ -817,7 +702,7 @@ ma::ConstantOp Cst64(EmitterLocOpBuilder b, int64_t v) {
 }
 
 Value RoundToBF16(EmitterLocOpBuilder b, Value input) {
-  return Cast(b, input, b.getBF16Type());
+  return triton::Cast(b, input, b.getBF16Type());
 };
 
 /*static*/ absl::StatusOr<MatMulDims> MatMulDims::Create(
@@ -1480,7 +1365,7 @@ class MatMulEmitterHelper {
             "64 bit dynamic-slice indices are not supported yet.");
       }
       majormost_dim_start_index_val =
-          Cast(b, majormost_dim_start_index_val, b.getI32Type());
+          triton::Cast(b, majormost_dim_start_index_val, b.getI32Type());
       majormost_dim_start_index_val =
           b.create<ma::MaxSIOp>(majormost_dim_start_index_val, Cst32(b, 0));
       majormost_dim_start_index_val =
@@ -2041,7 +1926,7 @@ class IterableInput {
     Value param_value = EmitParameterLoad(b, args.front(), boundary_checks_);
     if (type_ != storage_type_) {
       // For example cast i8 to i1.
-      param_value = Cast(b, param_value, type_);
+      param_value = triton::Cast(b, param_value, type_);
     }
     return param_value;
   }
@@ -2167,10 +2052,10 @@ Value EmitRegularMatmul(EmitterLocOpBuilder& b, Value lhs, Value rhs, Value acc,
   if (dot_instr->precision_config().algorithm() ==
       PrecisionConfig::ALG_DOT_BF16_BF16_F32) {
     if (dot_instr->operand(0)->shape().element_type() == F32) {
-      lhs = Cast(b, lhs, b.getBF16Type());
+      lhs = triton::Cast(b, lhs, b.getBF16Type());
     }
     if (dot_instr->operand(1)->shape().element_type() == F32) {
-      rhs = Cast(b, rhs, b.getBF16Type());
+      rhs = triton::Cast(b, rhs, b.getBF16Type());
     }
   }
 
@@ -2364,7 +2249,7 @@ absl::StatusOr<std::optional<stream_executor::gpu::TmaMetadata>> EmitMatMul(
   absl::flat_hash_map<const HloInstruction*, Value> values_out;
   TF_ASSIGN_OR_RETURN(Type acc_final_ty,
                       TritonType(b, dot_instr->shape().element_type()));
-  values_out[dot_instr] = Cast(b, acc_final, acc_final_ty);
+  values_out[dot_instr] = triton::Cast(b, acc_final, acc_final_ty);
 
   // Emit the output scope.
   if (std::vector<const HloInstruction*> to_emit =
```
