Commit 42a7655

Support target feature propagation to llvm backend. (#1049)
Add support for constructing the TargetMachine based on the target feature, which is then propagated to the LLVM backend. The target feature can be specified via the '--target-feature' command-line flag of the tpp-run tool, and can be used by passes that need knowledge of the CPU target feature.
1 parent 61536e4 commit 42a7655
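For example, the feature can be requested on the tpp-run command line (an illustrative invocation; the kernel file name is a placeholder, while the flag and the remaining options come from this commit and the existing tests):

    tpp-run kernel.mlir -e entry -entry-point-result=void --target-feature=avx512f

Recognized feature strings are avx, avx2, avx512f, avx512vnni, avx512bf16, amx, amx_bf16, amx_tile, neon and sve; any other value falls back to the per-architecture defaults compiled into tpp-run (see getTargetMachineOptions in tools/tpp-run/tpp-run.cpp below).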

File tree

10 files changed: +111 -66 lines changed

include/TPP/PassBundles.td

Lines changed: 9 additions & 0 deletions

@@ -20,6 +20,9 @@ def DefaultPipeline : Pass<"default-pipeline", "ModuleOp"> {
     Option<"gpuBackend", "gpu", "std::string",
            /*default=*/"\"\"",
            "Optional target GPU backend.">,
+    Option<"pipelineCpuTargetFeature", "target-feature", "std::string",
+           /*default=*/"",
+           "Optional CPU target feature.">,
   ];
 }

@@ -44,6 +47,8 @@ def DefaultTppPasses : Pass<"default-tpp-passes", "ModuleOp"> {
     Option<"vectorToKernel", "vector-to-kernel",
            "bool", /*default=*/"false",
            "Lower vector patterns to micro-kernels.">,
+    Option<"defBundleCpuTargetFeature", "target-feature", "std::string", "",
+           "Target feature for a given architecture">,
     Option<"lowerPackUnpackWithoutTranspose", "lower-pack-unpack-without-transpose",
            "bool", /*default=*/"false",
            "Lower non-constant packs and unpacks reverting any dim permutations.">,

@@ -97,6 +102,10 @@ def VectorToKernel : Pass<"vector-to-kernel", "ModuleOp"> {
   let summary = "Lower Vector operations to micro-kernel special lowering.";
   let dependentDialects = ["vector::VectorDialect",
                            "scf::SCFDialect"];
+  let options = [
+    Option<"vecBundleCpuTargetFeature", "target-feature", "std::string", "",
+           "Target feature for a given architecture">
+  ];
 }

 def LowLevelParallelization : Pass<"low-level-parallel", "ModuleOp"> {

include/TPP/Passes.td

Lines changed: 7 additions & 0 deletions

@@ -103,6 +103,10 @@ def VectorContractToFMA : Pass<
                            "tensor::TensorDialect",
                            "vector::VectorDialect",
                            "arith::ArithDialect"];
+  let options = [
+    Option<"targetFeature", "target-feature", "std::string", "",
+           "Target feature for a given architecture">,
+  ];
 }

 def VectorContractToAMX : Pass<

@@ -532,6 +536,9 @@ def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
     Option<"backend", "backend", "std::string",
            /*default=*/"\"cpu\"",
            "Kernel target device backend (cpu, cuda, intel).">,
+    Option<"wrapperCpuTargetFeature", "target-feature", "std::string",
+           /*default=*/"",
+           "CPU target feature (avx, avx2, avx512f, avx512vnni, avx512bf16, amx, amx_bf16, amx_tile, neon, sve).">,
     Option<"offloadToDevice", "offload-on-device", "bool",
            /*default=*/"true",
            "Offload kernel arguments to the target device.">,

lib/TPP/DefaultPipeline.cpp

Lines changed: 1 addition & 0 deletions

@@ -159,6 +159,7 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
     tppDefaultOptions.registerBlocking =
         SmallVector<unsigned>{registerBlocking.begin(), registerBlocking.end()};
     tppDefaultOptions.vectorToKernel = vectorToKernel;
+    tppDefaultOptions.defBundleCpuTargetFeature = pipelineCpuTargetFeature;

     pm.addPass(createDefaultTppPasses(tppDefaultOptions));
   }

lib/TPP/DefaultTppPasses.cpp

Lines changed: 3 additions & 1 deletion

@@ -151,7 +151,9 @@ struct DefaultTppPasses
       pm.addPass(createVectorToXSMM());
     }
     if (vectorToKernel) {
-      pm.addPass(createVectorToKernel());
+      VectorToKernelOptions options;
+      options.vecBundleCpuTargetFeature = defBundleCpuTargetFeature;
+      pm.addPass(createVectorToKernel(options));
     }
   }

lib/TPP/PassBundles/VectorToKernel.cpp

Lines changed: 5 additions & 1 deletion

@@ -35,6 +35,8 @@ namespace tpp {
 // specialized micro-kernels akin to libxsmm kernels.
 struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,
                         PassBundle<ModuleOp> {
+  using VectorToKernelBase::VectorToKernelBase;
+
   void runOnOperation() override {
     auto module = getOperation();

@@ -59,6 +61,8 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,
     if (vnni::utils::hasAMX())
       pm.addNestedPass<func::FuncOp>(createVectorContractToAMX());
     pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-    pm.addNestedPass<func::FuncOp>(createVectorContractToFMA());
+    VectorContractToFMAOptions options;
+    options.targetFeature = vecBundleCpuTargetFeature;
+    pm.addNestedPass<func::FuncOp>(createVectorContractToFMA(options));
   }
 };

lib/TPP/Transforms/VectorContractToFMA.cpp

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "TPP/Passes.h"
 #include "TPP/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"

test/BF16/Integration/amx/vector-contract-to-amx-gemm.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 // RUN: tpp-run -e entry --entry-point-result=void -print --splat-to-random --init-type normal -seed 123 %s > %t.1
-// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -fpu=amx-bf16 -print --splat-to-random --init-type normal -seed 123 > %t.2
+// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -print --splat-to-random --init-type normal -seed 123 > %t.2
 // RUN: fpcmp -r 0.001 %t.1 %t.2

test/BF16/Integration/amx/vector-contract-to-amx-mlp.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 // RUN: tpp-run -e entry --entry-point-result=void -print --splat-to-random --init-type normal -seed 123 %s > %t.1
-// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -fpu=amx-bf16 -print --splat-to-random --init-type normal -seed 123 > %t.2
+// RUN: tpp-run %s -e entry --entry-point-result=void --vector-to-kernels --registerBlocking=32,32,32 -print --splat-to-random --init-type normal -seed 123 > %t.2
 // RUN: fpcmp -r 0.001 %t.1 %t.2

test/Integration/tpp-run.mlir

Lines changed: 6 additions & 0 deletions

@@ -11,6 +11,9 @@
 // RUN: tpp-run %s -e entry -entry-point-result=void -n 2 -print-mlir=late 2>&1 | FileCheck %s --check-prefix=BENCH_STATS_2
 // RUN: tpp-run %s -e entry -entry-point-result=void -n 10 -print-mlir=late 2>&1 | FileCheck %s --check-prefix=BENCH_STATS_10

+// CPU target-feature propagation test
+// RUN: tpp-run %s -e entry -entry-point-result=void -print-mlir=llvm --target-feature=testfeature 2>&1 | FileCheck %s --check-prefix=CPU_FEATURE
+
 // CPU options can't be tested as even the LLVM IR is identical
 // Splat and init options in tpp-run-splat-* tests

@@ -159,3 +162,6 @@ func.func @entry(%A: tensor<4x8xf32>,
 // BENCH_STATS_10: call @_entry
 // BENCH_STATS_10-NOT: call @_entry
 // BENCH_STATS_10: {{[0-9]+}}{{.?}}{{[0-9e-]+}}
+
+// CPU_FEATURE-LABLE: @entry
+// CPU_FEATURE: '+testfeature' is not a recognized feature for this target (ignoring feature)

tools/tpp-run/tpp-run.cpp

Lines changed: 77 additions & 62 deletions

@@ -89,41 +89,6 @@ llvm::cl::opt<unsigned>
     optLevel("O", llvm::cl::desc("Speed optimization level (O0, O1, O2, O3)"),
              llvm::cl::value_desc("0-3"), llvm::cl::init(2));

-// Target Triple
-// Default x86_64, can be changed to aarch64 on other arches
-llvm::cl::opt<std::string> triple("triple", llvm::cl::desc("Target triple"),
-#if defined(__x86_64__)
-                                  llvm::cl::init("x86_64-linux-gnu"));
-#elif defined(__aarch64__)
-                                  llvm::cl::init("aarch64-linux-gnu"));
-#else
-#error Unsupported architecture
-#endif
-
-// Target CPU name
-// Default skylake is old enough to be relevant for most cases
-llvm::cl::opt<std::string>
-    cpuName("cpu", llvm::cl::desc("CPU name (sapphirerapids, alderlake, etc)"),
-#if defined(__x86_64__)
-            llvm::cl::init("nehalem"));
-#elif defined(__aarch64__)
-            llvm::cl::init("cortex-a53"));
-#else
-#error Unsupported architecture
-#endif
-
-// Target FPU name
-// Default avx2 is old enough to be relevant for most cases
-llvm::cl::opt<std::string>
-    fpuName("fpu", llvm::cl::desc("FPU name (avx, avx2, avx512bf16)"),
-#if defined(__x86_64__)
-            llvm::cl::init("sse4.2"));
-#elif defined(__aarch64__)
-            llvm::cl::init("neon"));
-#else
-#error Unsupported architecture
-#endif
-
 // Initializer type
 // Default const if seed == 0, and normal otherwise
 llvm::cl::opt<std::string> initType(

@@ -141,13 +106,59 @@ llvm::cl::opt<std::string>
     defGpuBackend("gpu", llvm::cl::desc("Target GPU backend for lowering"),
                   llvm::cl::value_desc("cuda,intel"), llvm::cl::init(""));

+// Select target CPU feature for the pipeline.
+llvm::cl::opt<std::string> runnerCpuTargetFeature(
+    "target-feature", llvm::cl::desc("Specify CPU target feature for lowering"),
+    llvm::cl::value_desc("avx, avx2, avx512f, avx512vnni, avx512bf16, amx, "
+                         "amx_bf16, amx_tile, neon, sve"),
+    llvm::cl::init(""));
+
 // Kernel buffers - arguments and return values - are expected to be allocated
 // on GPU.
 llvm::cl::opt<bool>
     defGpuArgs("gpu-args",
                llvm::cl::desc("Kernel buffers are allocated on GPU"),
                llvm::cl::init(true));

+struct TargetMachineOptions {
+  std::string triple;
+  std::string cpu;
+  std::string features;
+};
+
+/// Returns the target machine options for the given CPU feature string.
+/// Does not include full support for all CPU features, only the ones that are
+/// relevant for now.
+TargetMachineOptions getTargetMachineOptions(StringRef option) {
+  std::string defaultCpu = "";
+  std::string defaultFeature = "";
+  std::string defaultTriple = "";
+#if defined(__x86_64__)
+  defaultTriple = "x86_64-linux-gnu";
+  defaultCpu = "nehalem";
+  defaultFeature = "+sse4.2";
+#elif defined(__aarch64__)
+  defaultTriple = "aarch64-linux-gnu";
+  defaultCpu = "cortex-a53";
+  defaultFeature = "+neon";
+#else
+#error Unsupported architecture
+#endif
+  return StringSwitch<TargetMachineOptions>(option)
+      .Case("avx", {"x86_64-linux-gnu", "sandybridge", "+avx"})
+      .Case("avx2", {"x86_64-linux-gnu", "haswell", "+avx2"})
+      .Case("avx512f", {"x86_64-linux-gnu", "skylake-avx512", "+avx512f"})
+      .Case("avx512vnni", {"x86_64-linux-gnu", "znver4", "+avx512vnni"})
+      .Case("avx512bf16", {"x86_64-linux-gnu", "cooperlake", "+avx512bf16"})
+      .Case("amx", {"x86_64-linux-gnu", "sapphirerapids", "+amx"})
+      .Case("amx_bf16", {"x86_64-linux-gnu", "sapphirerapids", "+amx_bf16"})
+      .Case("amx_tile", {"x86_64-linux-gnu", "sapphirerapids", "+amx_tile"})
+      .Case("neon", {"armv8a-linux-gnu", "cortex-a53", "+neon"})
+      .Case("sve", {"armv8a-linux-gnu", "a64fx", "+sve"})
+      .Case("testfeature", {"x86_64-linux-gnu", "sandybridge", "+testfeature"})
+      .Default({defaultTriple, defaultCpu, defaultFeature});
+}
+
 // This function will be called by the pass manager after parsing,
 // so we can modify the IR with the needed wrappers
 static LogicalResult prepareMLIRKernel(Operation *op,

@@ -167,6 +178,7 @@ static LogicalResult prepareMLIRKernel(Operation *op,
   wrapperOpts.kernelName = options.mainFuncName;
   wrapperOpts.kernelType = options.mainFuncType;
   wrapperOpts.backend = defGpuBackend;
+  wrapperOpts.wrapperCpuTargetFeature = runnerCpuTargetFeature;
   wrapperOpts.offloadToDevice = defGpuArgs;
   wrapperOpts.numBenchLoops = benchNumLoops;
   // Warmup on GPUs are currently breaking buffer allocation on GPUs

@@ -177,7 +189,8 @@ static LogicalResult prepareMLIRKernel(Operation *op,
   wrapperOpts.initType = initType;
   passManager.addPass(tpp::createTppRunnerWrapper(wrapperOpts));

-  tpp::DefaultPipelineOptions defPipelineOpts{defGpuBackend};
+  tpp::DefaultPipelineOptions defPipelineOpts{defGpuBackend,
+                                              runnerCpuTargetFeature};
   passManager.addPass(tpp::createDefaultPipeline(defPipelineOpts));

   auto result = passManager.run(module);

@@ -198,34 +211,36 @@ std::unique_ptr<llvm::Module> lowerToLLVMIR(Operation *module,

   // Target machine, null if not specified
   std::unique_ptr<llvm::TargetMachine> targetMachine;
+  TargetMachineOptions targetMachineOptStr =
+      getTargetMachineOptions(runnerCpuTargetFeature);

   // Specify target machine
-  if (!triple.empty() && !cpuName.empty()) {
-    std::string error;
-    const llvm::Target *target =
-        llvm::TargetRegistry::lookupTarget(triple, error);
-    if (!target) {
-      llvm::errs() << "Error while looking up target triple: ";
-      llvm::errs() << error << "\n";
-      return nullptr;
-    }
-
-    auto codeGenOpt = (llvm::CodeGenOptLevel)optLevel.getValue();
-
-    // These options should force fused MLA, but they don't. :/
-    // Adding unsafe math attribute to functions below do the trick.
-    llvm::TargetOptions targetOptions;
-    targetOptions.UnsafeFPMath = true;
-    targetOptions.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
-    targetMachine.reset(target->createTargetMachine(
-        llvm::Triple(triple), cpuName, "+" + fpuName, targetOptions,
-        /* reloc model */ std::nullopt,
-        /* code model */ std::nullopt, codeGenOpt));
-    if (!targetMachine) {
-      llvm::errs() << "Error while looking up target CPU: ";
-      llvm::errs() << cpuName << "\n";
-      return nullptr;
-    }
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(targetMachineOptStr.triple, error);
+  if (!target) {
+    llvm::errs() << "Error while looking up target triple: ";
+    llvm::errs() << error << "\n";
+    return nullptr;
+  }
+
+  auto codeGenOpt = (llvm::CodeGenOptLevel)optLevel.getValue();
+
+  // These options should force fused MLA, but they don't. :/
+  // Adding unsafe math attribute to functions below do the trick.
+  llvm::TargetOptions targetOptions;
+  targetOptions.UnsafeFPMath = true;
+  targetOptions.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
+  targetMachine.reset(target->createTargetMachine(
+      llvm::Triple(targetMachineOptStr.triple), targetMachineOptStr.cpu,
+      targetMachineOptStr.features, targetOptions,
+      /* reloc model */ std::nullopt,
+      /* code model */ std::nullopt, codeGenOpt));
+
+  if (!targetMachine) {
+    llvm::errs() << "Error while looking up target CPU: ";
+    llvm::errs() << targetMachineOptStr.cpu << "\n";
+    return nullptr;
   }

   // Run the optimized pipeline
