Commit 42a7655

Support target feature propagation to llvm backend. (#1049)
Add support for constructing the TargetMachine based on the target feature, which is then propagated to the LLVM backend. The target feature can be specified via the '--target-feature' command-line flag of the tpp-run tool, and can be used by passes that need knowledge of the CPU target feature.
1 parent 61536e4 commit 42a7655
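For example, the feature can be requested on the tpp-run command line (an illustrative invocation; the kernel file name is a placeholder, while the flag and the remaining options come from this commit and the existing tests):

    tpp-run kernel.mlir -e entry -entry-point-result=void --target-feature=avx512f

Recognized feature strings are avx, avx2, avx512f, avx512vnni, avx512bf16, amx, amx_bf16, amx_tile, neon and sve; any other value falls back to the per-architecture defaults compiled into tpp-run (see getTargetMachineOptions in tools/tpp-run/tpp-run.cpp below).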

File tree

10 files changed: +111 -66 lines changed

include/TPP/PassBundles.td

Lines changed: 9 additions & 0 deletions

@@ -20,6 +20,9 @@ def DefaultPipeline : Pass<"default-pipeline", "ModuleOp"> {
     Option<"gpuBackend", "gpu", "std::string",
            /*default=*/"\"\"",
            "Optional target GPU backend.">,
+    Option<"pipelineCpuTargetFeature", "target-feature", "std::string",
+           /*default=*/"",
+           "Optional CPU target feature.">,
   ];
 }

@@ -44,6 +47,8 @@ def DefaultTppPasses : Pass<"default-tpp-passes", "ModuleOp"> {
     Option<"vectorToKernel", "vector-to-kernel",
            "bool", /*default=*/"false",
            "Lower vector patterns to micro-kernels.">,
+    Option<"defBundleCpuTargetFeature", "target-feature", "std::string", "",
+           "Target feature for a given architecture">,
     Option<"lowerPackUnpackWithoutTranspose", "lower-pack-unpack-without-transpose",
            "bool", /*default=*/"false",
            "Lower non-constant packs and unpacks reverting any dim permutations.">,

@@ -97,6 +102,10 @@ def VectorToKernel : Pass<"vector-to-kernel", "ModuleOp"> {
   let summary = "Lower Vector operations to micro-kernel special lowering.";
   let dependentDialects = ["vector::VectorDialect",
                            "scf::SCFDialect"];
+  let options = [
+    Option<"vecBundleCpuTargetFeature", "target-feature", "std::string", "",
+           "Target feature for a given architecture">
+  ];
 }

 def LowLevelParallelization : Pass<"low-level-parallel", "ModuleOp"> {

include/TPP/Passes.td

Lines changed: 7 additions & 0 deletions

@@ -103,6 +103,10 @@ def VectorContractToFMA : Pass<
                            "tensor::TensorDialect",
                            "vector::VectorDialect",
                            "arith::ArithDialect"];
+  let options = [
+    Option<"targetFeature", "target-feature", "std::string", "",
+           "Target feature for a given architecture">,
+  ];
 }

 def VectorContractToAMX : Pass<

@@ -532,6 +536,9 @@ def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
     Option<"backend", "backend", "std::string",
            /*default=*/"\"cpu\"",
            "Kernel target device backend (cpu, cuda, intel).">,
+    Option<"wrapperCpuTargetFeature", "target-feature", "std::string",
+           /*default=*/"",
+           "CPU target feature (avx, avx2, avx512f, avx512vnni, avx512bf16, amx, amx_bf16, amx_tile, neon, sve).">,
     Option<"offloadToDevice", "offload-on-device", "bool",
            /*default=*/"true",
            "Offload kernel arguments to the target device.">,

lib/TPP/DefaultPipeline.cpp

Lines changed: 1 addition & 0 deletions

@@ -159,6 +159,7 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
     tppDefaultOptions.registerBlocking =
         SmallVector<unsigned>{registerBlocking.begin(), registerBlocking.end()};
     tppDefaultOptions.vectorToKernel = vectorToKernel;
+    tppDefaultOptions.defBundleCpuTargetFeature = pipelineCpuTargetFeature;

     pm.addPass(createDefaultTppPasses(tppDefaultOptions));
   }

lib/TPP/DefaultTppPasses.cpp

Lines changed: 3 additions & 1 deletion

@@ -151,7 +151,9 @@ struct DefaultTppPasses
       pm.addPass(createVectorToXSMM());
     }
     if (vectorToKernel) {
-      pm.addPass(createVectorToKernel());
+      VectorToKernelOptions options;
+      options.vecBundleCpuTargetFeature = defBundleCpuTargetFeature;
+      pm.addPass(createVectorToKernel(options));
     }
   }

lib/TPP/PassBundles/VectorToKernel.cpp

Lines changed: 5 additions & 1 deletion

@@ -35,6 +35,8 @@ namespace tpp {
 // specialized micro-kernels akin to libxsmm kernels.
 struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,
                         PassBundle<ModuleOp> {
+  using VectorToKernelBase::VectorToKernelBase;
+
   void runOnOperation() override {
     auto module = getOperation();

@@ -59,6 +61,8 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,
     if (vnni::utils::hasAMX())
       pm.addNestedPass<func::FuncOp>(createVectorContractToAMX());
     pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-    pm.addNestedPass<func::FuncOp>(createVectorContractToFMA());
+    VectorContractToFMAOptions options;
+    options.targetFeature = vecBundleCpuTargetFeature;
+    pm.addNestedPass<func::FuncOp>(createVectorContractToFMA(options));
   }
 };

lib/TPP/Transforms/VectorContractToFMA.cpp

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "TPP/Passes.h"
 #include "TPP/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"

test/BF16/Integration/amx/vector-contract-to-amx-gemm.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 // RUN: tpp-run -e entry --entry-point-result=void -print --splat-to-random --init-type normal -seed 123 %s > %t.1
-// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -fpu=amx-bf16 -print --splat-to-random --init-type normal -seed 123 > %t.2
+// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -print --splat-to-random --init-type normal -seed 123 > %t.2
 // RUN: fpcmp -r 0.001 %t.1 %t.2

test/BF16/Integration/amx/vector-contract-to-amx-mlp.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 // RUN: tpp-run -e entry --entry-point-result=void -print --splat-to-random --init-type normal -seed 123 %s > %t.1
-// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -fpu=amx-bf16 -print --splat-to-random --init-type normal -seed 123 > %t.2
+// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -print --splat-to-random --init-type normal -seed 123 > %t.2
 // RUN: fpcmp -r 0.001 %t.1 %t.2

test/Integration/tpp-run.mlir

Lines changed: 6 additions & 0 deletions

@@ -11,6 +11,9 @@
 // RUN: tpp-run %s -e entry -entry-point-result=void -n 2 -print-mlir=late 2>&1 | FileCheck %s --check-prefix=BENCH_STATS_2
 // RUN: tpp-run %s -e entry -entry-point-result=void -n 10 -print-mlir=late 2>&1 | FileCheck %s --check-prefix=BENCH_STATS_10

+// CPU target-feature propagation test
+// RUN: tpp-run %s -e entry -entry-point-result=void -print-mlir=llvm --target-feature=testfeature 2>&1 | FileCheck %s --check-prefix=CPU_FEATURE
+
 // CPU options can't be tested as even the LLVM IR is identical
 // Splat and init options in tpp-run-splat-* tests

@@ -159,3 +162,6 @@ func.func @entry(%A: tensor<4x8xf32>,
 // BENCH_STATS_10: call @_entry
 // BENCH_STATS_10-NOT: call @_entry
 // BENCH_STATS_10: {{[0-9]+}}{{.?}}{{[0-9e-]+}}
+
+// CPU_FEATURE-LABLE: @entry
+// CPU_FEATURE: '+testfeature' is not a recognized feature for this target (ignoring feature)

tools/tpp-run/tpp-run.cpp

Lines changed: 77 additions & 62 deletions

@@ -89,41 +89,6 @@ llvm::cl::opt<unsigned>
     optLevel("O", llvm::cl::desc("Speed optimization level (O0, O1, O2, O3)"),
              llvm::cl::value_desc("0-3"), llvm::cl::init(2));

-// Target Triple
-// Default x86_64, can be changed to aarch64 on other arches
-llvm::cl::opt<std::string> triple("triple", llvm::cl::desc("Target triple"),
-#if defined(__x86_64__)
-                                  llvm::cl::init("x86_64-linux-gnu"));
-#elif defined(__aarch64__)
-                                  llvm::cl::init("aarch64-linux-gnu"));
-#else
-#error Unsupported architecture
-#endif
-
-// Target CPU name
-// Default skylake is old enough to be relevant for most cases
-llvm::cl::opt<std::string>
-    cpuName("cpu", llvm::cl::desc("CPU name (sapphirerapids, alderlake, etc)"),
-#if defined(__x86_64__)
-            llvm::cl::init("nehalem"));
-#elif defined(__aarch64__)
-            llvm::cl::init("cortex-a53"));
-#else
-#error Unsupported architecture
-#endif
-
-// Target FPU name
-// Default avx2 is old enough to be relevant for most cases
-llvm::cl::opt<std::string>
-    fpuName("fpu", llvm::cl::desc("FPU name (avx, avx2, avx512bf16)"),
-#if defined(__x86_64__)
-            llvm::cl::init("sse4.2"));
-#elif defined(__aarch64__)
-            llvm::cl::init("neon"));
-#else
-#error Unsupported architecture
-#endif
-
 // Initializer type
 // Default const if seed == 0, and normal otherwise
 llvm::cl::opt<std::string> initType(

@@ -141,13 +106,59 @@ llvm::cl::opt<std::string>
     defGpuBackend("gpu", llvm::cl::desc("Target GPU backend for lowering"),
                   llvm::cl::value_desc("cuda,intel"), llvm::cl::init(""));

+// Select target CPU feature for the pipeline.
+llvm::cl::opt<std::string> runnerCpuTargetFeature(
+    "target-feature", llvm::cl::desc("Specify CPU target feature for lowering"),
+    llvm::cl::value_desc("avx, avx2, avx512f, avx512vnni, avx512bf16, amx, "
+                         "amx_bf16, amx_tile, neon, sve"),
+    llvm::cl::init(""));
+
 // Kernel buffers - arguments and return values - are expected to be allocated
 // on GPU.
 llvm::cl::opt<bool>
     defGpuArgs("gpu-args",
                llvm::cl::desc("Kernel buffers are allocated on GPU"),
                llvm::cl::init(true));

+struct TargetMachineOptions {
+  std::string triple;
+  std::string cpu;
+  std::string features;
+};
+
+/// Returns the target machine options for the given CPU feature string.
+/// Does not include full support for all CPU features, only the ones that are
+/// relevant for now.
+TargetMachineOptions getTargetMachineOptions(StringRef option) {
+  std::string defaultCpu = "";
+  std::string defaultFeature = "";
+  std::string defaultTriple = "";
+#if defined(__x86_64__)
+  defaultTriple = "x86_64-linux-gnu";
+  defaultCpu = "nehalem";
+  defaultFeature = "+sse4.2";
+#elif defined(__aarch64__)
+  defaultTriple = "aarch64-linux-gnu";
+  defaultCpu = "cortex-a53";
+  defaultFeature = "+neon";
+#else
+#error Unsupported architecture
+#endif
+  return StringSwitch<TargetMachineOptions>(option)
+      .Case("avx", {"x86_64-linux-gnu", "sandybridge", "+avx"})
+      .Case("avx2", {"x86_64-linux-gnu", "haswell", "+avx2"})
+      .Case("avx512f", {"x86_64-linux-gnu", "skylake-avx512", "+avx512f"})
+      .Case("avx512vnni", {"x86_64-linux-gnu", "znver4", "+avx512vnni"})
+      .Case("avx512bf16", {"x86_64-linux-gnu", "cooperlake", "+avx512bf16"})
+      .Case("amx", {"x86_64-linux-gnu", "sapphirerapids", "+amx"})
+      .Case("amx_bf16", {"x86_64-linux-gnu", "sapphirerapids", "+amx_bf16"})
+      .Case("amx_tile", {"x86_64-linux-gnu", "sapphirerapids", "+amx_tile"})
+      .Case("neon", {"armv8a-linux-gnu", "cortex-a53", "+neon"})
+      .Case("sve", {"armv8a-linux-gnu", "a64fx", "+sve"})
+      .Case("testfeature", {"x86_64-linux-gnu", "sandybridge", "+testfeature"})
+      .Default({defaultTriple, defaultCpu, defaultFeature});
+}
+
 // This function will be called by the pass manager after parsing,
 // so we can modify the IR with the needed wrappers
 static LogicalResult prepareMLIRKernel(Operation *op,

@@ -167,6 +178,7 @@ static LogicalResult prepareMLIRKernel(Operation *op,
   wrapperOpts.kernelName = options.mainFuncName;
   wrapperOpts.kernelType = options.mainFuncType;
   wrapperOpts.backend = defGpuBackend;
+  wrapperOpts.wrapperCpuTargetFeature = runnerCpuTargetFeature;
   wrapperOpts.offloadToDevice = defGpuArgs;
   wrapperOpts.numBenchLoops = benchNumLoops;
   // Warmup on GPUs are currently breaking buffer allocation on GPUs

@@ -177,7 +189,8 @@ static LogicalResult prepareMLIRKernel(Operation *op,
   wrapperOpts.initType = initType;
   passManager.addPass(tpp::createTppRunnerWrapper(wrapperOpts));

-  tpp::DefaultPipelineOptions defPipelineOpts{defGpuBackend};
+  tpp::DefaultPipelineOptions defPipelineOpts{defGpuBackend,
+                                              runnerCpuTargetFeature};
   passManager.addPass(tpp::createDefaultPipeline(defPipelineOpts));

   auto result = passManager.run(module);

@@ -198,34 +211,36 @@ std::unique_ptr<llvm::Module> lowerToLLVMIR(Operation *module,

   // Target machine, null if not specified
   std::unique_ptr<llvm::TargetMachine> targetMachine;
+  TargetMachineOptions targetMachineOptStr =
+      getTargetMachineOptions(runnerCpuTargetFeature);

   // Specify target machine
-  if (!triple.empty() && !cpuName.empty()) {
-    std::string error;
-    const llvm::Target *target =
-        llvm::TargetRegistry::lookupTarget(triple, error);
-    if (!target) {
-      llvm::errs() << "Error while looking up target triple: ";
-      llvm::errs() << error << "\n";
-      return nullptr;
-    }
-
-    auto codeGenOpt = (llvm::CodeGenOptLevel)optLevel.getValue();
-
-    // These options should force fused MLA, but they don't. :/
-    // Adding unsafe math attribute to functions below do the trick.
-    llvm::TargetOptions targetOptions;
-    targetOptions.UnsafeFPMath = true;
-    targetOptions.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
-    targetMachine.reset(target->createTargetMachine(
-        llvm::Triple(triple), cpuName, "+" + fpuName, targetOptions,
-        /* reloc model */ std::nullopt,
-        /* code model */ std::nullopt, codeGenOpt));
-    if (!targetMachine) {
-      llvm::errs() << "Error while looking up target CPU: ";
-      llvm::errs() << cpuName << "\n";
-      return nullptr;
-    }
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(targetMachineOptStr.triple, error);
+  if (!target) {
+    llvm::errs() << "Error while looking up target triple: ";
+    llvm::errs() << error << "\n";
+    return nullptr;
+  }
+
+  auto codeGenOpt = (llvm::CodeGenOptLevel)optLevel.getValue();
+
+  // These options should force fused MLA, but they don't. :/
+  // Adding unsafe math attribute to functions below do the trick.
+  llvm::TargetOptions targetOptions;
+  targetOptions.UnsafeFPMath = true;
+  targetOptions.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
+  targetMachine.reset(target->createTargetMachine(
+      llvm::Triple(targetMachineOptStr.triple), targetMachineOptStr.cpu,
+      targetMachineOptStr.features, targetOptions,
+      /* reloc model */ std::nullopt,
+      /* code model */ std::nullopt, codeGenOpt));
+
+  if (!targetMachine) {
+    llvm::errs() << "Error while looking up target CPU: ";
+    llvm::errs() << targetMachineOptStr.cpu << "\n";
+    return nullptr;
   }

   // Run the optimized pipeline
