Priority fusion: fix analysis of reduction epilogues.

jreiffers · copybara-github · commit f577aae9755a · 2023-11-08T00:36:42.000-08:00
Previously, the analysis sometimes failed to detect that a fusion
can be emitted using the reduction emitter.

PiperOrigin-RevId: 580430642
diff --git a/xla/service/gpu/hlo_traversal.cc b/xla/service/gpu/hlo_traversal.cc
@@ -59,6 +59,9 @@ FusionBoundaryFn MakeProducerConsumerFusion(
       // producer.
       return &fused_producer != &producer;
     }
+    if (&producer == &fused_consumer) {
+      return true;
+    }
 
     // Otherwise, fall back to the default; we're already in the fused
     // producer.
diff --git a/xla/service/gpu/ir_emission_utils.cc b/xla/service/gpu/ir_emission_utils.cc
@@ -723,15 +723,25 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
   return std::nullopt;
 }
 
-bool IsIntermediate(const HloInstruction* instr, int allowed_operand_count) {
+bool IsIntermediate(const HloInstruction* instr, int allowed_operand_count,
+                    FusionBoundaryFn boundary) {
   // Number of operands should be in range [1, allowed_operand_count].
   if (instr->operand_count() == 0 ||
       instr->operand_count() > allowed_operand_count) {
     return false;
   }
 
   // Intermediate `instr` can't have multiple users.
-  if (instr->user_count() > 1) {
+  // If we have a boundary function, only consider users within the
+  // boundary. This isn't really correct, since the real users aren't
+  // necessarily the instruction's users at this point.
+  // TODO(jreiffers): Figure out the point of this check.
+  int64_t num_users =
+      boundary ? absl::c_count_if(
+                     instr->users(),
+                     [&](const auto* user) { return !boundary(*instr, *user); })
+               : instr->user_count();
+  if (num_users > 1) {
     return false;
   }
 
@@ -780,7 +790,8 @@ const HloInstruction& FindNonTrivialHero(
       auto preds = FindPredecessors(*node, is_boundary);
       return preds.size() == 1 ? preds.front() : nullptr;
     }
-    return IsIntermediate(node) && !is_boundary(*node->operand(0), *node)
+    return IsIntermediate(node, 1, is_boundary) &&
+                   !is_boundary(*node->operand(0), *node)
                ? node->operand(0)
                : nullptr;
   };
diff --git a/xla/service/gpu/ir_emission_utils.h b/xla/service/gpu/ir_emission_utils.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/service/gpu/hlo_traversal.h"
 
 namespace xla {
 namespace gpu {
@@ -193,7 +194,8 @@ std::optional<TransposeDescription> FindTiledLogicalTranspose(
 std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
     const HloInstruction& root, const HloInstruction& hero);
 
-bool IsIntermediate(const HloInstruction* instr, int allowed_operand_count = 1);
+bool IsIntermediate(const HloInstruction* instr, int allowed_operand_count = 1,
+                    FusionBoundaryFn boundary = nullptr);
 
 // Log the given module if the VLOG level is >= level.
 void VLogModule(int level, const llvm::Module& module);
diff --git a/xla/service/gpu/model/gpu_performance_model.cc b/xla/service/gpu/model/gpu_performance_model.cc
@@ -444,6 +444,7 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
   absl::Duration exec_time_fused = absl::ZeroDuration();
   absl::Duration producer_output_read_time_unfused = absl::ZeroDuration();
   for (const HloInstruction* fused_consumer : fused_consumers) {
+    VLOG(8) << "Consumer: " << fused_consumer->name();
     float utilization_by_this_consumer = cost_analysis->operand_utilization(
         *fused_consumer, fused_consumer->operand_index(producer));
     total_producer_utilization += utilization_by_this_consumer;
@@ -478,6 +479,9 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
     absl::Duration input_access_time_by_this_consumer = ProducerInputAccessTime(
         cost_analysis, *device_info, launch_dimensions_fused.num_blocks(),
         producer, analysis_fused, config, fused_consumer);
+    VLOG(10) << "  Compute time by consumer: " << compute_time_by_this_consumer;
+    VLOG(10) << "  Input access time by consumer: "
+             << input_access_time_by_this_consumer;
 
     exec_time_fused += std::max(compute_time_by_this_consumer,
                                 input_access_time_by_this_consumer);
@@ -486,11 +490,14 @@ GpuPerformanceModel::RunTimes GpuPerformanceModel::EstimateRunTimes(
                                          utilization_by_this_consumer);
     int64_t n_bytes_net = std::min(producer_data.bytes_written, n_bytes_total);
 
-    producer_output_read_time_unfused += ReadTime(
+    auto read_time_unfused = ReadTime(
         *device_info, launch_dimensions_unfused.num_blocks(), n_bytes_net,
         n_bytes_total, fused_consumer->shape().element_type(),
         /*coalesced=*/!TransposesMinorDimension(fused_consumer),
         config.first_read_from_dram);
+
+    VLOG(10) << "  Read time unfused: " << read_time_unfused;
+    producer_output_read_time_unfused += read_time_unfused;
   }
 
   absl::Duration time_unfused =
diff --git a/xla/service/gpu/priority_fusion_test.cc b/xla/service/gpu/priority_fusion_test.cc
@@ -549,5 +549,39 @@ CHECK: ROOT {{.*}} reduce(
   )");
 }
 
+TEST_F(PriorityFusionTest, FuseReductionEpilogueWithMultipleUsers) {
+  // Regression test that verifies we correctly fuse the `log` into the reduce.
+  constexpr absl::string_view kHlo = R"(
+    HloModule test_module
+
+    add {
+      x = f32[] parameter(0)
+      y = f32[] parameter(1)
+      ROOT add = f32[] add(x, y)
+    }
+
+    fused_computation {
+      p0 = f32[64,16384]{1,0} parameter(0)
+      c0 = f32[] constant(0)
+      ROOT reduce.858 = f32[64]{0} reduce(p0, c0), dimensions={1}, to_apply=add
+    }
+
+    ENTRY main {
+      p0 = f32[64,16384]{1,0} parameter(0)
+      fusion = f32[64]{0} fusion(p0), kind=kInput, calls=fused_computation
+      log = f32[64]{0} log(fusion)
+      negate = f32[64]{0} custom-call(log), custom_call_target="negate"
+      ROOT add = f32[64]{0} add(negate, log)
+    }
+  )";
+
+  RunAndFilecheckHloRewrite(kHlo, std::move(priority_fusion_), R"(
+    CHECK: ENTRY
+    CHECK: %[[PARAM:.*]] = {{.*}} parameter(0)
+    CHECK: %[[FUSION:.*]] = {{.*}} fusion(%[[PARAM]])
+    CHECK: custom-call(%[[FUSION]])
+  )");
+}
+
 }  // namespace gpu
 }  // namespace xla

Original file line number	Diff line number	Diff line change
`@@ -59,6 +59,9 @@ FusionBoundaryFn MakeProducerConsumerFusion(`
`59`	`59`	`// producer.`
`60`	`60`	`return &fused_producer != &producer;`
`61`	`61`	`}`
	`62`	`+ if (&producer == &fused_consumer) {`
	`63`	`+ return true;`
	`64`	`+ }`
`62`	`65`
`63`	`66`	`// Otherwise, fall back to the default; we're already in the fused`
`64`	`67`	`// producer.`