@@ -89,41 +89,6 @@ llvm::cl::opt<unsigned>
89
89
optLevel (" O" , llvm::cl::desc(" Speed optimization level (O0, O1, O2, O3)" ),
90
90
llvm::cl::value_desc(" 0-3" ), llvm::cl::init(2 ));
91
91
92
- // Target Triple
93
- // Default x86_64, can be changed to aarch64 on other arches
94
- llvm::cl::opt<std::string> triple (" triple" , llvm::cl::desc(" Target triple" ),
95
- #if defined(__x86_64__)
96
- llvm::cl::init (" x86_64-linux-gnu" ));
97
- #elif defined(__aarch64__)
98
- llvm::cl::init (" aarch64-linux-gnu" ));
99
- #else
100
- #error Unsupported architecture
101
- #endif
102
-
103
- // Target CPU name
104
- // Default skylake is old enough to be relevant for most cases
105
- llvm::cl::opt<std::string>
106
- cpuName (" cpu" , llvm::cl::desc(" CPU name (sapphirerapids, alderlake, etc)" ),
107
- #if defined(__x86_64__)
108
- llvm::cl::init (" nehalem" ));
109
- #elif defined(__aarch64__)
110
- llvm::cl::init (" cortex-a53" ));
111
- #else
112
- #error Unsupported architecture
113
- #endif
114
-
115
- // Target FPU name
116
- // Default avx2 is old enough to be relevant for most cases
117
- llvm::cl::opt<std::string>
118
- fpuName (" fpu" , llvm::cl::desc(" FPU name (avx, avx2, avx512bf16)" ),
119
- #if defined(__x86_64__)
120
- llvm::cl::init (" sse4.2" ));
121
- #elif defined(__aarch64__)
122
- llvm::cl::init (" neon" ));
123
- #else
124
- #error Unsupported architecture
125
- #endif
126
-
127
92
// Initializer type
128
93
// Default const if seed == 0, and normal otherwise
129
94
llvm::cl::opt<std::string> initType (
@@ -141,13 +106,59 @@ llvm::cl::opt<std::string>
141
106
defGpuBackend (" gpu" , llvm::cl::desc(" Target GPU backend for lowering" ),
142
107
llvm::cl::value_desc(" cuda,intel" ), llvm::cl::init(" " ));
143
108
109
+ // Select target CPU feature for the pipeline.
110
+ llvm::cl::opt<std::string> runnerCpuTargetFeature (
111
+ " target-feature" , llvm::cl::desc(" Specify CPU target feature for lowering" ),
112
+ llvm::cl::value_desc(" avx, avx2, avx512f, avx512vnni, avx512bf16, amx, "
113
+ " amx_bf16, amx_tile, neon, sve" ),
114
+ llvm::cl::init(" " ));
115
+
144
116
// Kernel buffers - arguments and return values - are expected to be allocated
145
117
// on GPU.
146
118
llvm::cl::opt<bool >
147
119
defGpuArgs (" gpu-args" ,
148
120
llvm::cl::desc (" Kernel buffers are allocated on GPU" ),
149
121
llvm::cl::init(true ));
150
122
123
/// Plain bundle of the three strings used to construct the LLVM target
/// machine: target triple, CPU name and feature string.
struct TargetMachineOptions {
  /// Target triple, e.g. "x86_64-linux-gnu".
  std::string triple;
  /// CPU name, e.g. "haswell".
  std::string cpu;
  /// Feature string in LLVM syntax, e.g. "+avx2".
  std::string features;
};
128
+
129
+ // / Returns the target machine options for the given CPU feature string.
130
+ // / Does not include full support for all CPU features, only the ones that are
131
+ // / relevant for now.
132
+ TargetMachineOptions getTargetMachineOptions (StringRef option) {
133
+ std::string defaultCpu = " " ;
134
+ std::string defaultFeature = " " ;
135
+ std::string defaultTriple = " " ;
136
+ #if defined(__x86_64__)
137
+ defaultTriple = " x86_64-linux-gnu" ;
138
+ defaultCpu = " nehalem" ;
139
+ defaultFeature = " +sse4.2" ;
140
+ #elif defined(__aarch64__)
141
+ defaultTriple = " aarch64-linux-gnu" ;
142
+ defaultCpu = " cortex-a53" ;
143
+ defaultFeature = " +neon" ;
144
+ #else
145
+ #error Unsupported architecture
146
+ #endif
147
+ return StringSwitch<TargetMachineOptions>(option)
148
+ .Case (" avx" , {" x86_64-linux-gnu" , " sandybridge" , " +avx" })
149
+ .Case (" avx2" , {" x86_64-linux-gnu" , " haswell" , " +avx2" })
150
+ .Case (" avx512f" , {" x86_64-linux-gnu" , " skylake-avx512" , " +avx512f" })
151
+ .Case (" avx512vnni" , {" x86_64-linux-gnu" , " znver4" , " +avx512vnni" })
152
+ .Case (" avx512bf16" , {" x86_64-linux-gnu" , " cooperlake" , " +avx512bf16" })
153
+ .Case (" amx" , {" x86_64-linux-gnu" , " sapphirerapids" , " +amx" })
154
+ .Case (" amx_bf16" , {" x86_64-linux-gnu" , " sapphirerapids" , " +amx_bf16" })
155
+ .Case (" amx_tile" , {" x86_64-linux-gnu" , " sapphirerapids" , " +amx_tile" })
156
+ .Case (" neon" , {" armv8a-linux-gnu" , " cortex-a53" , " +neon" })
157
+ .Case (" sve" , {" armv8a-linux-gnu" , " a64fx" , " +sve" })
158
+ .Case (" testfeature" , {" x86_64-linux-gnu" , " sandybridge" , " +testfeature" })
159
+ .Default ({defaultTriple, defaultCpu, defaultFeature});
160
+ }
161
+
151
162
// This function will be called by the pass manager after parsing,
152
163
// so we can modify the IR with the needed wrappers
153
164
static LogicalResult prepareMLIRKernel (Operation *op,
@@ -167,6 +178,7 @@ static LogicalResult prepareMLIRKernel(Operation *op,
167
178
wrapperOpts.kernelName = options.mainFuncName ;
168
179
wrapperOpts.kernelType = options.mainFuncType ;
169
180
wrapperOpts.backend = defGpuBackend;
181
+ wrapperOpts.wrapperCpuTargetFeature = runnerCpuTargetFeature;
170
182
wrapperOpts.offloadToDevice = defGpuArgs;
171
183
wrapperOpts.numBenchLoops = benchNumLoops;
172
184
// Warmup on GPUs are currently breaking buffer allocation on GPUs
@@ -177,7 +189,8 @@ static LogicalResult prepareMLIRKernel(Operation *op,
177
189
wrapperOpts.initType = initType;
178
190
passManager.addPass (tpp::createTppRunnerWrapper (wrapperOpts));
179
191
180
- tpp::DefaultPipelineOptions defPipelineOpts{defGpuBackend};
192
+ tpp::DefaultPipelineOptions defPipelineOpts{defGpuBackend,
193
+ runnerCpuTargetFeature};
181
194
passManager.addPass (tpp::createDefaultPipeline (defPipelineOpts));
182
195
183
196
auto result = passManager.run (module);
@@ -198,34 +211,36 @@ std::unique_ptr<llvm::Module> lowerToLLVMIR(Operation *module,
198
211
199
212
// Target machine, null if not specified
200
213
std::unique_ptr<llvm::TargetMachine> targetMachine;
214
+ TargetMachineOptions targetMachineOptStr =
215
+ getTargetMachineOptions (runnerCpuTargetFeature);
201
216
202
217
// Specify target machine
203
- if (!triple. empty () && !cpuName. empty ()) {
204
- std::string error;
205
- const llvm::Target *target =
206
- llvm::TargetRegistry::lookupTarget (triple, error);
207
- if (!target) {
208
- llvm::errs () << " Error while looking up target triple: " ;
209
- llvm::errs () << error << " \n " ;
210
- return nullptr ;
211
- }
212
-
213
- auto codeGenOpt = (llvm::CodeGenOptLevel)optLevel. getValue ();
214
-
215
- // These options should force fused MLA, but they don't. :/
216
- // Adding unsafe math attribute to functions below do the trick.
217
- llvm::TargetOptions targetOptions ;
218
- targetOptions.UnsafeFPMath = true ;
219
- targetOptions. AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
220
- targetMachine. reset (target-> createTargetMachine (
221
- llvm::Triple (triple), cpuName, " + " + fpuName , targetOptions,
222
- /* reloc model */ std::nullopt,
223
- /* code model */ std::nullopt, codeGenOpt));
224
- if (!targetMachine) {
225
- llvm::errs () << " Error while looking up target CPU: " ;
226
- llvm::errs () << cpuName << " \n " ;
227
- return nullptr ;
228
- }
218
+ std::string error;
219
+ const llvm::Target *target =
220
+ llvm::TargetRegistry::lookupTarget (targetMachineOptStr. triple , error);
221
+ if (!target) {
222
+ llvm::errs () << " Error while looking up target triple: " ;
223
+ llvm::errs () << error << " \n " ;
224
+ return nullptr ;
225
+ }
226
+
227
+ auto codeGenOpt = (llvm::CodeGenOptLevel)optLevel. getValue ();
228
+
229
+ // These options should force fused MLA, but they don't. :/
230
+ // Adding unsafe math attribute to functions below do the trick.
231
+ llvm::TargetOptions targetOptions;
232
+ targetOptions. UnsafeFPMath = true ;
233
+ targetOptions.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast ;
234
+ targetMachine. reset (target-> createTargetMachine (
235
+ llvm::Triple (targetMachineOptStr. triple ), targetMachineOptStr. cpu ,
236
+ targetMachineOptStr. features , targetOptions,
237
+ /* reloc model */ std::nullopt,
238
+ /* code model */ std::nullopt, codeGenOpt));
239
+
240
+ if (!targetMachine) {
241
+ llvm::errs () << " Error while looking up target CPU: " ;
242
+ llvm::errs () << targetMachineOptStr. cpu << " \n " ;
243
+ return nullptr ;
229
244
}
230
245
231
246
// Run the optimized pipeline
0 commit comments