LLVM 20.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
49
50using namespace llvm;
51using namespace llvm::AMDGPU;
52
53// This should get the default rounding mode from the kernel. We just set the
54// default here, but this could change if the OpenCL rounding mode pragmas are
55// used.
56//
57// The denormal mode here should match what is reported by the OpenCL runtime
58// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
59// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
60//
61// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
62// precision, and leaves single precision to flush all and does not report
63// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
64// CL_FP_DENORM for both.
65//
66// FIXME: It seems some instructions do not support single precision denormals
67// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
68// and sin_f32, cos_f32 on most parts).
69
70// We want to use these instructions, and using fp32 denormals also causes
71// instructions to run at the double precision rate for the device so it's
72// probably best to just report no single precision denormals.
76 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
77 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
78}
79
80static AsmPrinter *
82 std::unique_ptr<MCStreamer> &&Streamer) {
83 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
84}
85
91}
92
94 std::unique_ptr<MCStreamer> Streamer)
95 : AsmPrinter(TM, std::move(Streamer)) {
96 assert(OutStreamer && "AsmPrinter constructed without streamer");
97}
98
100 return "AMDGPU Assembly Printer";
101}
102
104 return TM.getMCSubtargetInfo();
105}
106
108 if (!OutStreamer)
109 return nullptr;
110 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
111}
112
115}
116
// Prepares the target streamer for module M. If a target streamer exists and
// has no target ID yet, initializeTargetID(M) is called; later the HSA
// metadata stream is started with the streamer's target ID.
// NOTE(review): several statements of this function are absent from this
// listing, so the conditions guarding the early return and the metadata
// block are not visible here - confirm against the full source.
117void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
119
120 // TODO: Which one is called first, emitStartOfAsmFile or
121 // emitFunctionBodyStart?
122 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
123 initializeTargetID(M);
124
127 return;
128
130
133 CodeObjectVersion);
// Begin collecting HSA metadata for this module using the target ID stored
// on the target streamer.
134 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
135 }
136
139}
140
142 // Init target streamer if it has not yet happened
144 initTargetStreamer(M);
145
148
149 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
150 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
152 HSAMetadataStream->end();
153 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
154 (void)Success;
155 assert(Success && "Malformed HSA Metadata");
156 }
157}
158
161 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
162 const Function &F = MF->getFunction();
163
164 // TODO: We're checking this late, would be nice to check it earlier.
165 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
167 STM.getCPU() + " is only available on code object version 6 or better",
168 /*gen_crash_diag*/ false);
169 }
170
171 // TODO: Which one is called first, emitStartOfAsmFile or
172 // emitFunctionBodyStart?
173 if (!getTargetStreamer()->getTargetID())
174 initializeTargetID(*F.getParent());
175
176 const auto &FunctionTargetID = STM.getTargetID();
177 // Make sure function's xnack settings are compatible with module's
178 // xnack settings.
179 if (FunctionTargetID.isXnackSupported() &&
180 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
181 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
182 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
183 "' function does not match module xnack setting");
184 return;
185 }
186 // Make sure function's sramecc settings are compatible with module's
187 // sramecc settings.
188 if (FunctionTargetID.isSramEccSupported() &&
189 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
190 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
191 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
192 "' function does not match module sramecc setting");
193 return;
194 }
195
196 if (!MFI.isEntryFunction())
197 return;
198
199 if (STM.isMesaKernel(F) &&
200 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
201 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
202 AMDGPUMCKernelCodeT KernelCode;
203 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
204 KernelCode.validate(&STM, MF->getContext());
206 }
207
208 if (STM.isAmdHsaOS())
209 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
210}
211
214 if (!MFI.isEntryFunction())
215 return;
216
218 return;
219
220 auto &Streamer = getTargetStreamer()->getStreamer();
221 auto &Context = Streamer.getContext();
222 auto &ObjectFileInfo = *Context.getObjectFileInfo();
223 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
224
225 Streamer.pushSection();
226 Streamer.switchSection(&ReadOnlySection);
227
228 // CP microcode requires the kernel descriptor to be allocated on 64 byte
229 // alignment.
230 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
231 ReadOnlySection.ensureMinAlignment(Align(64));
232
233 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
234
235 SmallString<128> KernelName;
236 getNameWithPrefix(KernelName, &MF->getFunction());
238 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
239 CurrentProgramInfo.NumVGPRsForWavesPerEU,
241 CurrentProgramInfo.NumSGPRsForWavesPerEU,
243 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
244 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
245 Context),
246 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
247
248 Streamer.popSection();
249}
250
252 Register RegNo = MI->getOperand(0).getReg();
253
256 OS << "implicit-def: "
257 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
258
259 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
260 OS << " : SGPR spill to VGPR lane";
261
262 OutStreamer->AddComment(OS.str());
263 OutStreamer->addBlankLine();
264}
265
269 return;
270 }
271
273 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
274 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
275 SmallString<128> SymbolName;
276 getNameWithPrefix(SymbolName, &MF->getFunction()),
278 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
279 }
280 if (DumpCodeInstEmitter) {
281 // Disassemble function name label to text.
282 DisasmLines.push_back(MF->getName().str() + ":");
283 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
284 HexLines.emplace_back("");
285 }
286
288}
289
291 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
292 // Write a line for the basic block label if it is not only fallthrough.
293 DisasmLines.push_back(
294 (Twine("BB") + Twine(getFunctionNumber())
295 + "_" + Twine(MBB.getNumber()) + ":").str());
296 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
297 HexLines.emplace_back("");
298 }
300}
301
304 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
306 Twine(GV->getName()) +
307 ": unsupported initializer for address space");
308 return;
309 }
310
311 // LDS variables aren't emitted in HSA or PAL yet.
313 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
314 return;
315
316 MCSymbol *GVSym = getSymbol(GV);
317
318 GVSym->redefineIfPossible();
319 if (GVSym->isDefined() || GVSym->isVariable())
320 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
321 "' is already defined");
322
323 const DataLayout &DL = GV->getDataLayout();
324 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
325 Align Alignment = GV->getAlign().value_or(Align(4));
326
327 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
328 emitLinkage(GV, GVSym);
329 auto *TS = getTargetStreamer();
330 TS->emitAMDGPULDS(GVSym, Size, Alignment);
331 return;
332 }
333
335}
336
338 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
339
341 switch (CodeObjectVersion) {
343 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
344 break;
346 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
347 break;
349 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
350 break;
351 default:
352 report_fatal_error("Unexpected code object version");
353 }
354 }
355
357}
358
// Checks the resource-usage symbols recorded for function F against
// subtarget limits and reports violations through F's LLVMContext.
// Declarations and functions whose calling convention is not a module entry
// function calling convention are ignored. Each check only fires when the
// corresponding symbol is a variable whose value evaluates to an absolute
// constant.
// NOTE(review): the declarations of STM, RI, MF and MFI, and a few condition
// lines, are missing from this listing - confirm against the full source.
359void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
360 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
361 return;
362
365 MCSymbol *FnSym = TM.getSymbol(&F);
366 bool IsLocal = F.hasLocalLinkage();
367
// Helper: stores the absolute value of Value in Res and returns true, or
// returns false (leaving Res untouched) when Value cannot be evaluated.
368 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
369 int64_t Val;
370 if (Value->evaluateAsAbsolute(Val)) {
371 Res = Val;
372 return true;
373 }
374 return false;
375 };
376
// Scratch (private segment) size check. Note that, unlike the register
// checks below, this diagnostic does not return early.
377 const uint64_t MaxScratchPerWorkitem =
379 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
380 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
381 uint64_t ScratchSize;
382 if (ScratchSizeSymbol->isVariable() &&
383 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
384 ScratchSize > MaxScratchPerWorkitem) {
385 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
386 DS_Error);
387 F.getContext().diagnose(DiagStackSize);
388 }
389
390 // Validate addressable scalar registers (i.e., prior to added implicit
391 // SGPRs).
392 MCSymbol *NumSGPRSymbol =
393 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
395 !STM.hasSGPRInitBug()) {
396 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
397 uint64_t NumSgpr;
398 if (NumSGPRSymbol->isVariable() &&
399 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
400 NumSgpr > MaxAddressableNumSGPRs) {
401 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
402 NumSgpr, MaxAddressableNumSGPRs,
404 F.getContext().diagnose(Diag);
405 return;
406 }
407 }
408
409 MCSymbol *VCCUsedSymbol =
410 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
411 MCSymbol *FlatUsedSymbol = RI.getSymbol(
412 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
413 uint64_t VCCUsed, FlatUsed, NumSgpr;
414
// Everything below needs the SGPR count and the VCC / flat-scratch usage
// flags to be resolvable; otherwise the remaining checks are skipped.
415 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
416 FlatUsedSymbol->isVariable() &&
417 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
418 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
419 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
420
421 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
422 // resolvable.
423 NumSgpr += IsaInfo::getNumExtraSGPRs(
424 &STM, VCCUsed, FlatUsed,
425 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
427 STM.hasSGPRInitBug()) {
428 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
429 if (NumSgpr > MaxAddressableNumSGPRs) {
430 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
431 MaxAddressableNumSGPRs, DS_Error,
433 F.getContext().diagnose(Diag);
434 return;
435 }
436 }
437
438 MCSymbol *NumVgprSymbol =
439 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
440 MCSymbol *NumAgprSymbol =
441 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
442 uint64_t NumVgpr, NumAgpr;
443
445 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
// Occupancy check: requires a MachineFunction for F and resolvable VGPR and
// AGPR counts.
447 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
448 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
449 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
451 unsigned MaxWaves = MFI.getMaxWavesPerEU();
452 uint64_t TotalNumVgpr =
453 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
// Register counts used for the occupancy computation are clamped from below
// by 1 and by the subtarget minimum for MaxWaves.
454 uint64_t NumVGPRsForWavesPerEU = std::max(
455 {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
456 uint64_t NumSGPRsForWavesPerEU = std::max(
457 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
458 const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
459 STM.getOccupancyWithWorkGroupSizes(*MF).second,
460 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
461 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
462 OutContext);
463 uint64_t Occupancy;
464
// Read the "amdgpu-waves-per-eu" attribute pair, defaulting to {0, 0}.
465 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
466 F, "amdgpu-waves-per-eu", {0, 0}, true);
467
// Diagnose when the evaluated occupancy is below the requested minimum.
468 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
470 F, F.getSubprogram(),
471 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
472 "'" +
473 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
474 ", final occupancy is " + Twine(Occupancy));
475 F.getContext().diagnose(Diag);
476 return;
477 }
478 }
479 }
480}
481
483 // Pad with s_code_end to help tools and guard against instruction prefetch
484 // causing stale data in caches. Arguably this should be done by the linker,
485 // which is why this isn't done for Mesa.
486 const MCSubtargetInfo &STI = *getGlobalSTI();
487 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
490 OutStreamer->switchSection(getObjFileLowering().getTextSection());
492 }
493
494 // Assign expressions which can only be resolved when all other functions are
495 // known.
497
498 // Switch section and emit all GPR maximums within the processed module.
499 OutStreamer->pushSection();
500 MCSectionELF *MaxGPRSection =
501 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
502 OutStreamer->switchSection(MaxGPRSection);
506 OutStreamer->popSection();
507
508 for (Function &F : M.functions())
509 validateMCResourceInfo(F);
510
511 RI.reset();
512
514}
515
// Renders the MCExpr Value as text and returns it. The expression is first
// passed through foldAMDGPUMCExpr using the MCContext of the target
// streamer's MCStreamer, and the result is printed with printAMDGPUMCExpr
// into a stream backed by Str.
// NOTE(review): the declaration of Str is missing from this listing;
// presumably it is a SmallString<128> matching the return type - confirm.
516SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
518 raw_svector_ostream OSS(Str);
519 auto &Streamer = getTargetStreamer()->getStreamer();
520 auto &Context = Streamer.getContext();
521 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
522 printAMDGPUMCExpr(New, OSS, MAI);
523 return Str;
524}
525
526// Print comments that apply to both callable functions and entry points.
527void AMDGPUAsmPrinter::emitCommonFunctionComments(
528 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
529 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
530 const AMDGPUMachineFunction *MFI) {
531 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
532 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
533 false);
534 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
535 if (NumAGPR && TotalNumVGPR) {
536 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
537 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
538 false);
539 }
540 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
541 false);
542 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
543 false);
544}
545
// Builds the kernel_code_properties expression for MF. A 16-bit constant is
// assembled with one KERNEL_CODE_PROPERTY_ENABLE_SGPR_* bit for each user
// SGPR that UserSGPRInfo reports, plus ENABLE_WAVEFRONT_SIZE32 under a
// condition that is not visible here. That constant is then OR'ed with
// CurrentProgramInfo.DynamicCallStack shifted left by
// KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, and the resulting MCExpr is
// returned.
// NOTE(review): the declaration of MFI and the condition guarding the
// wavefront-size bit are missing from this listing - confirm against the
// full source.
546const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
547 const MachineFunction &MF) const {
549 MCContext &Ctx = MF.getContext();
550 uint16_t KernelCodeProperties = 0;
551 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
552
553 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
554 KernelCodeProperties |=
555 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
556 }
557 if (UserSGPRInfo.hasDispatchPtr()) {
558 KernelCodeProperties |=
559 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
560 }
561 if (UserSGPRInfo.hasQueuePtr()) {
562 KernelCodeProperties |=
563 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
564 }
565 if (UserSGPRInfo.hasKernargSegmentPtr()) {
566 KernelCodeProperties |=
567 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
568 }
569 if (UserSGPRInfo.hasDispatchID()) {
570 KernelCodeProperties |=
571 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
572 }
573 if (UserSGPRInfo.hasFlatScratchInit()) {
574 KernelCodeProperties |=
575 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
576 }
577 if (UserSGPRInfo.hasPrivateSegmentSize()) {
578 KernelCodeProperties |=
579 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
580 }
582 KernelCodeProperties |=
583 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
584 }
585
586 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
587 // un-evaluatable at this point so it cannot be conditionally checked here.
588 // Instead, we'll directly shift the possibly unknown MCExpr into its place
589 // and bitwise-or it into KernelCodeProperties.
590 const MCExpr *KernelCodePropExpr =
591 MCConstantExpr::create(KernelCodeProperties, Ctx);
592 const MCExpr *OrValue = MCConstantExpr::create(
593 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
594 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
595 OrValue, Ctx);
596 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
597
598 return KernelCodePropExpr;
599}
600
// Fills in and returns an MCKernelDescriptor for MF. The fields set here are
// group_segment_fixed_size, private_segment_fixed_size (PI.ScratchSize),
// kernarg_size, compute_pgm_rsrc1/2 (from PI), kernel_code_properties,
// compute_pgm_rsrc3 and kernarg_preload.
// NOTE(review): the return-type line, the declarations of STM and Info, and
// the right-hand side of group_segment_fixed_size are missing from this
// listing - confirm against the full source.
602AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
603 const SIProgramInfo &PI) const {
605 const Function &F = MF.getFunction();
607 MCContext &Ctx = MF.getContext();
608
609 MCKernelDescriptor KernelDescriptor;
610
611 KernelDescriptor.group_segment_fixed_size =
613 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
614
// MaxKernArgAlign is passed to getKernArgSegmentSize but is not read again
// in this function.
615 Align MaxKernArgAlign;
616 KernelDescriptor.kernarg_size = MCConstantExpr::create(
617 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
618
619 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
620 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
621 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
622
// Sanity check only: on subtargets without GFX90A instructions, rsrc3 must
// be 0 whenever it can be evaluated. The (void) casts silence unused-variable
// warnings when asserts are compiled out.
// NOTE(review): rsrc3 is read from CurrentProgramInfo rather than from the
// PI parameter.
623 int64_t PGRM_Rsrc3 = 1;
624 bool EvaluatableRsrc3 =
625 CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
626 (void)PGRM_Rsrc3;
627 (void)EvaluatableRsrc3;
628 assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
629 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
630 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
631
// kernarg_preload is the preloaded-SGPR count when the subtarget has kernarg
// preload, otherwise 0.
632 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
633 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
634 Ctx);
635
636 return KernelDescriptor;
637}
638
640 // Init target streamer lazily on the first function so that previous passes
641 // can set metadata.
643 initTargetStreamer(*MF.getFunction().getParent());
644
645 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
646 CurrentProgramInfo.reset(MF);
647
649 MCContext &Ctx = MF.getContext();
650
651 // The starting address of all shader programs must be 256 bytes aligned.
652 // Regular functions just need the basic required instruction alignment.
653 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
654
656
659 bool IsLocal = MF.getFunction().hasLocalLinkage();
660 // FIXME: This should be an explicit check for Mesa.
661 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
662 MCSectionELF *ConfigSection =
663 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
664 OutStreamer->switchSection(ConfigSection);
665 }
666
668 ResourceUsage->getResourceInfo();
670
671 if (MFI->isModuleEntryFunction()) {
672 getSIProgramInfo(CurrentProgramInfo, MF);
673 }
674
675 if (STM.isAmdPalOS()) {
676 if (MFI->isEntryFunction())
677 EmitPALMetadata(MF, CurrentProgramInfo);
678 else if (MFI->isModuleEntryFunction())
679 emitPALFunctionMetadata(MF);
680 } else if (!STM.isAmdHsaOS()) {
681 EmitProgramInfoSI(MF, CurrentProgramInfo);
682 }
683
684 DumpCodeInstEmitter = nullptr;
685 if (STM.dumpCode()) {
686 // For -dumpcode, get the assembler out of the streamer. This only works
687 // with -filetype=obj.
688 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
689 if (Assembler)
690 DumpCodeInstEmitter = Assembler->getEmitterPtr();
691 }
692
693 DisasmLines.clear();
694 HexLines.clear();
696
698
699 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
700 STM.hasMAIInsts());
701
702 {
705 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
706 IsLocal),
707 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
708 IsLocal),
709 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
710 IsLocal),
711 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
712 OutContext, IsLocal),
713 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
714 IsLocal),
715 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
716 OutContext, IsLocal),
717 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
718 OutContext, IsLocal),
719 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
720 IsLocal),
721 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
722 OutContext, IsLocal));
723 }
724
725 if (isVerbose()) {
726 MCSectionELF *CommentSection =
727 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
728 OutStreamer->switchSection(CommentSection);
729
730 if (!MFI->isEntryFunction()) {
732 OutStreamer->emitRawComment(" Function info:", false);
733
734 emitCommonFunctionComments(
735 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
736 IsLocal)
738 STM.hasMAIInsts()
739 ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
740 OutContext, IsLocal)
742 : nullptr,
743 RI.createTotalNumVGPRs(MF, Ctx),
745 MF,
747 Ctx),
748 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
749 OutContext, IsLocal)
751 getFunctionCodeSize(MF), MFI);
752 return false;
753 }
754
755 OutStreamer->emitRawComment(" Kernel info:", false);
756 emitCommonFunctionComments(
757 CurrentProgramInfo.NumArchVGPR,
758 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
759 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
760 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
761
762 OutStreamer->emitRawComment(
763 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
764 OutStreamer->emitRawComment(
765 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
766 OutStreamer->emitRawComment(
767 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
768 " bytes/workgroup (compile time only)", false);
769
770 OutStreamer->emitRawComment(
771 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
772
773 OutStreamer->emitRawComment(
774 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
775
776 OutStreamer->emitRawComment(
777 " NumSGPRsForWavesPerEU: " +
778 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
779 false);
780 OutStreamer->emitRawComment(
781 " NumVGPRsForWavesPerEU: " +
782 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
783 false);
784
785 if (STM.hasGFX90AInsts()) {
786 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
787 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
788 AdjustedAccum = MCBinaryExpr::createMul(
789 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
790 OutStreamer->emitRawComment(
791 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
792 }
793
794 OutStreamer->emitRawComment(
795 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
796
797 OutStreamer->emitRawComment(
798 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
799
800 OutStreamer->emitRawComment(
801 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
802 getMCExprStr(CurrentProgramInfo.ScratchEnable),
803 false);
804 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
805 Twine(CurrentProgramInfo.UserSGPR),
806 false);
807 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
808 Twine(CurrentProgramInfo.TrapHandlerEnable),
809 false);
810 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
811 Twine(CurrentProgramInfo.TGIdXEnable),
812 false);
813 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
814 Twine(CurrentProgramInfo.TGIdYEnable),
815 false);
816 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
817 Twine(CurrentProgramInfo.TGIdZEnable),
818 false);
819 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
820 Twine(CurrentProgramInfo.TIdIGCompCount),
821 false);
822
823 [[maybe_unused]] int64_t PGMRSrc3;
824 assert(STM.hasGFX90AInsts() ||
825 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
826 PGMRSrc3) &&
827 static_cast<uint64_t>(PGMRSrc3) == 0));
828 if (STM.hasGFX90AInsts()) {
829 OutStreamer->emitRawComment(
830 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
831 getMCExprStr(MCKernelDescriptor::bits_get(
832 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
833 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
834 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
835 false);
836 OutStreamer->emitRawComment(
837 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
838 getMCExprStr(MCKernelDescriptor::bits_get(
839 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
840 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
841 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
842 false);
843 }
844 }
845
846 if (DumpCodeInstEmitter) {
847
848 OutStreamer->switchSection(
849 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
850
851 for (size_t i = 0; i < DisasmLines.size(); ++i) {
852 std::string Comment = "\n";
853 if (!HexLines[i].empty()) {
854 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
855 Comment += " ; " + HexLines[i] + "\n";
856 }
857
858 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
859 OutStreamer->emitBytes(StringRef(Comment));
860 }
861 }
862
863 return false;
864}
865
866// TODO: Fold this into emitFunctionBodyStart.
// Establishes the target ID held by the target streamer for module M.
// For a non-empty module, the functions of M are then visited in order: for
// each feature (xnack, sramecc) that is supported and still set to 'Any', the
// setting from STMTargetID is copied in. The walk stops as soon as every
// supported feature is already On or Off.
// NOTE(review): the start of the first statement and the declaration of STM
// are missing from this listing - confirm against the full source.
867void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
868 // In the beginning all features are either 'Any' or 'NotSupported',
869 // depending on global target features. This will cover empty modules.
871 getGlobalSTI()->getFeatureString());
872
873 // If module is empty, we are done.
874 if (M.empty())
875 return;
876
877 // If module is not empty, need to find first 'Off' or 'On' feature
878 // setting per feature from functions in module.
879 for (auto &F : M) {
880 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Nothing left to refine: each feature is either unsupported or already
// fixed to On/Off.
881 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
882 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
883 break;
884
886 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only a setting that is still 'Any' is overwritten.
887 if (TSTargetID->isXnackSupported())
888 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
889 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
890 if (TSTargetID->isSramEccSupported())
891 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
892 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
893 }
894}
895
// Returns the sum of TII->getInstSizeInBytes(MI) over every instruction in
// every basic block of MF, skipping debug instructions.
// NOTE(review): the declaration of STM is missing from this listing -
// confirm against the full source.
896uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
898 const SIInstrInfo *TII = STM.getInstrInfo();
899
900 uint64_t CodeSize = 0;
901
902 for (const MachineBasicBlock &MBB : MF) {
903 for (const MachineInstr &MI : MBB) {
904 // TODO: CodeSize should account for multiple functions.
905
906 // TODO: Should we count size of debug info?
907 if (MI.isDebugInstr())
908 continue;
909
910 CodeSize += TII->getInstSizeInBytes(MI);
911 }
912 }
913
914 return CodeSize;
915}
916
917// AccumOffset computed for the MCExpr equivalent of:
918// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
919static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
920 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
921 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
922
923 // Can't be lower than 1 for subsequent alignTo.
924 const MCExpr *MaximumTaken =
925 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
926
927 // Practically, it's computing divideCeil(MaximumTaken, 4).
928 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
929 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
930 Ctx);
931
932 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
933}
934
935void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
936 const MachineFunction &MF) {
938 bool IsLocal = MF.getFunction().hasLocalLinkage();
939 MCContext &Ctx = MF.getContext();
940
941 auto CreateExpr = [&Ctx](int64_t Value) {
942 return MCConstantExpr::create(Value, Ctx);
943 };
944
945 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
946 int64_t Val;
947 if (Value->evaluateAsAbsolute(Val)) {
948 Res = Val;
949 return true;
950 }
951 return false;
952 };
953
954 auto GetSymRefExpr =
955 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
956 MCSymbol *Sym =
957 RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
958 return MCSymbolRefExpr::create(Sym, Ctx);
959 };
960
962 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
963 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
965 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
966
967 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
968 ProgInfo.TgSplit = STM.isTgSplitEnabled();
969 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
970 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
971 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
972 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
973 ProgInfo.DynamicCallStack =
974 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
975 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
976
978
979 // The calculations related to SGPR/VGPR blocks are
980 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
981 // unified.
982 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
983 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
984 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
985
986 // Check the addressable register limit before we add ExtraSGPRs.
988 !STM.hasSGPRInitBug()) {
989 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
990 uint64_t NumSgpr;
991 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
992 NumSgpr > MaxAddressableNumSGPRs) {
993 // This can happen due to a compiler bug or when using inline asm.
996 MF.getFunction(), "addressable scalar registers", NumSgpr,
997 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
998 Ctx.diagnose(Diag);
999 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1000 }
1001 }
1002
1003 // Account for extra SGPRs and VGPRs reserved for debugger use.
1004 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1005
1006 const Function &F = MF.getFunction();
1007
1008 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1009 // dispatch registers are function args.
1010 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1011
1012 if (isShader(F.getCallingConv())) {
1013 bool IsPixelShader =
1014 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
1015
1016 // Calculate the number of VGPR registers based on the SPI input registers
1017 uint32_t InputEna = 0;
1018 uint32_t InputAddr = 0;
1019 unsigned LastEna = 0;
1020
1021 if (IsPixelShader) {
1022 // Note for IsPixelShader:
1023 // By this stage, all enabled inputs are tagged in InputAddr as well.
1024 // We will use InputAddr to determine whether the input counts against the
1025 // vgpr total and only use the InputEnable to determine the last input
1026 // that is relevant - if extra arguments are used, then we have to honour
1027 // the InputAddr for any intermediate non-enabled inputs.
1028 InputEna = MFI->getPSInputEnable();
1029 InputAddr = MFI->getPSInputAddr();
1030
1031 // We only need to consider input args up to the last used arg.
1032 assert((InputEna || InputAddr) &&
1033 "PSInputAddr and PSInputEnable should "
1034 "never both be 0 for AMDGPU_PS shaders");
1035 // There are some rare circumstances where InputAddr is non-zero and
1036 // InputEna can be set to 0. In this case we default to setting LastEna
1037 // to 1.
1038 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
1039 }
1040
1041 // FIXME: We should be using the number of registers determined during
1042 // calling convention lowering to legalize the types.
1043 const DataLayout &DL = F.getDataLayout();
1044 unsigned PSArgCount = 0;
1045 unsigned IntermediateVGPR = 0;
1046 for (auto &Arg : F.args()) {
1047 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1048 if (Arg.hasAttribute(Attribute::InReg)) {
1049 WaveDispatchNumSGPR += NumRegs;
1050 } else {
1051 // If this is a PS shader and we're processing the PS Input args (first
1052 // 16 VGPR), use the InputEna and InputAddr bits to define how many
1053 // VGPRs are actually used.
1054 // Any extra VGPR arguments are handled as normal arguments (and
1055 // contribute to the VGPR count whether they're used or not).
1056 if (IsPixelShader && PSArgCount < 16) {
1057 if ((1 << PSArgCount) & InputAddr) {
1058 if (PSArgCount < LastEna)
1059 WaveDispatchNumVGPR += NumRegs;
1060 else
1061 IntermediateVGPR += NumRegs;
1062 }
1063 PSArgCount++;
1064 } else {
1065 // If there are extra arguments we have to include the allocation for
1066 // the non-used (but enabled with InputAddr) input arguments
1067 if (IntermediateVGPR) {
1068 WaveDispatchNumVGPR += IntermediateVGPR;
1069 IntermediateVGPR = 0;
1070 }
1071 WaveDispatchNumVGPR += NumRegs;
1072 }
1073 }
1074 }
1076 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
1077
1079 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1080
1082 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1083 } else if (isKernel(F.getCallingConv()) &&
1085 // Consider cases where the total number of UserSGPRs with trailing
1086 // allocated preload SGPRs, is greater than the number of explicitly
1087 // referenced SGPRs.
1088 const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1089 CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1090 ProgInfo.NumSGPR =
1091 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
1092 }
1093
1094 // Adjust number of registers used to meet default/requested minimum/maximum
1095 // number of waves per execution unit request.
1096 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1097 ProgInfo.NumSGPRsForWavesPerEU =
1098 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1099 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1100 Ctx);
1101 ProgInfo.NumVGPRsForWavesPerEU =
1102 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1103 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
1104 Ctx);
1105
1107 STM.hasSGPRInitBug()) {
1108 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1109 uint64_t NumSgpr;
1110 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1111 NumSgpr > MaxAddressableNumSGPRs) {
1112 // This can happen due to a compiler bug or when using inline asm to use
1113 // the registers which are usually reserved for vcc etc.
1115 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1116 NumSgpr, MaxAddressableNumSGPRs,
1118 Ctx.diagnose(Diag);
1119 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1120 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1121 }
1122 }
1123
1124 if (STM.hasSGPRInitBug()) {
1125 ProgInfo.NumSGPR =
1127 ProgInfo.NumSGPRsForWavesPerEU =
1129 }
1130
1131 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1133 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1134 MFI->getNumUserSGPRs(),
1136 Ctx.diagnose(Diag);
1137 }
1138
1139 if (MFI->getLDSSize() >
1140 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
1143 MF.getFunction(), "local memory", MFI->getLDSSize(),
1145 Ctx.diagnose(Diag);
1146 }
1147 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1148 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1149 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1150 unsigned Granule) {
1151 const MCExpr *OneConst = CreateExpr(1ul);
1152 const MCExpr *GranuleConst = CreateExpr(Granule);
1153 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1154 const MCExpr *AlignToGPR =
1155 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1156 const MCExpr *DivGPR =
1157 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1158 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1159 return SubGPR;
1160 };
1161
1162 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1164 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1166
1167 const SIModeRegisterDefaults Mode = MFI->getMode();
1168
1169 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1170 // register.
1171 ProgInfo.FloatMode = getFPMode(Mode);
1172
1173 ProgInfo.IEEEMode = Mode.IEEE;
1174
1175 // Make clamp modifier on NaN input returns 0.
1176 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1177
1178 unsigned LDSAlignShift;
1179 if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
1180 // LDS is allocated in 320 dword blocks.
1181 LDSAlignShift = 11;
1182 } else if (STM.getFeatureBits().test(
1183 FeatureAddressableLocalMemorySize65536)) {
1184 // LDS is allocated in 128 dword blocks.
1185 LDSAlignShift = 9;
1186 } else {
1187 // LDS is allocated in 64 dword blocks.
1188 LDSAlignShift = 8;
1189 }
1190
1191 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1192 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1193
1194 ProgInfo.LDSSize = MFI->getLDSSize();
1195 ProgInfo.LDSBlocks =
1196 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1197
1198 // The MCExpr equivalent of divideCeil.
1199 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1200 const MCExpr *Ceil =
1201 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1202 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1203 };
1204
1205 // Scratch is allocated in 64-dword or 256-dword blocks.
1206 unsigned ScratchAlignShift =
1207 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1208 // We need to program the hardware with the amount of scratch memory that
1209 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1210 // scratch memory used per thread.
1211 ProgInfo.ScratchBlocks = DivideCeil(
1213 CreateExpr(STM.getWavefrontSize()), Ctx),
1214 CreateExpr(1ULL << ScratchAlignShift));
1215
1216 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1217 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1218 ProgInfo.MemOrdered = 1;
1219 }
1220
1221 // 0 = X, 1 = XY, 2 = XYZ
1222 unsigned TIDIGCompCnt = 0;
1223 if (MFI->hasWorkItemIDZ())
1224 TIDIGCompCnt = 2;
1225 else if (MFI->hasWorkItemIDY())
1226 TIDIGCompCnt = 1;
1227
1228 // The private segment wave byte offset is the last of the system SGPRs. We
1229 // initially assumed it was allocated, and may have used it. It shouldn't harm
1230 // anything to disable it if we know the stack isn't used here. We may still
1231 // have emitted code reading it to initialize scratch, but if that's unused
1232 // reading garbage should be OK.
1235 MCConstantExpr::create(0, Ctx), Ctx),
1236 ProgInfo.DynamicCallStack, Ctx);
1237
1238 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1239 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1240 ProgInfo.TrapHandlerEnable =
1241 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1242 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1243 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1244 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1245 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1246 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1247 ProgInfo.EXCPEnMSB = 0;
1248 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1249 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1250 ProgInfo.EXCPEnable = 0;
1251
1252 if (STM.hasGFX90AInsts()) {
1253 // return ((Dst & ~Mask) | (Value << Shift))
1254 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1255 uint32_t Shift) {
1256 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1257 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1258 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1260 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1261 return Dst;
1262 };
1263
1264 ProgInfo.ComputePGMRSrc3GFX90A =
1265 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1266 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1267 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1268 ProgInfo.ComputePGMRSrc3GFX90A =
1269 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1270 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1271 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1272 }
1273
1275 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1276 ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1277
1278 const auto [MinWEU, MaxWEU] =
1279 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1280 uint64_t Occupancy;
1281 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1283 F, F.getSubprogram(),
1284 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1285 "'" +
1286 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1287 ", final occupancy is " + Twine(Occupancy));
1288 F.getContext().diagnose(Diag);
1289 }
1290}
1291
// Selects an unsigned value by switching on the calling convention; the
// `default` label falls through into the first explicit case. Presumably the
// result is a PGM_RSRC1 register identifier (the caller stores it in a
// variable named RsrcReg and emits it as a 32-bit word) — confirm upstream.
// NOTE(review): listing lines 1295-1301, which hold every case label and
// return statement, are absent from this copy; restore them before building.
1292 static unsigned getRsrcReg(CallingConv::ID CallConv) {
1293 switch (CallConv) {
1294 default: [[fallthrough]];
1302 }
1303 }
1304
// Writes the program-info words for MF to OutStreamer. Values that are held
// as MCExprs go through EmitResolvedOrExpr; the spilled SGPR/VGPR counts are
// written last, each as a (register id, value) pair of 32-bit words.
// NOTE(review): this copy is missing listing lines 1307, 1316, 1328-1329,
// 1334, 1337, 1365, 1384-1385, 1390 and 1392 — among them the declaration of
// MFI, the return statement of SetBits, and the conditions that open the
// first branch and the pixel-shader-looking block near the end. Restore them
// from upstream before building.
1305 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1306 const SIProgramInfo &CurrentProgramInfo) {
1308 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1309 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1310 MCContext &Ctx = MF.getContext();
1311
1312 // (((Value) & Mask) << Shift)
1313 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1314 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1315 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1317 shft, Ctx);
1318 };
1319
// Emit Value as a Size-byte integer when it already evaluates to an absolute
// constant; otherwise hand the unresolved expression to the streamer.
1320 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1321 int64_t Val;
1322 if (Value->evaluateAsAbsolute(Val))
1323 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1324 else
1325 OutStreamer->emitValue(Value, Size);
1326 };
1327
1330
1331 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1332 /*Size=*/4);
1333
1335 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1336
1338
1339 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1340 // appropriate generation.
// The field always starts at bit 12; only its width changes (18 bits on
// GFX12+, 15 bits on GFX11, 13 bits otherwise).
1341 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1342 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1343 /*Mask=*/0x3FFFF, /*Shift=*/12),
1344 /*Size=*/4);
1345 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1346 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1347 /*Mask=*/0x7FFF, /*Shift=*/12),
1348 /*Size=*/4);
1349 } else {
1350 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1351 /*Mask=*/0x1FFF, /*Shift=*/12),
1352 /*Size=*/4);
1353 }
1354
1355 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1356 // 0" comment but I don't see a corresponding field in the register spec.
1357 } else {
1358 OutStreamer->emitInt32(RsrcReg);
1359
// Pack the VGPR block count (mask 0x3F, shift 0) together with the SGPR block
// count (mask 0x0F, shift 6) into a single 32-bit word.
1360 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1361 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1362 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1363 MF.getContext());
1364 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1366
1367 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1368 // appropriate generation.
1369 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1370 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1371 /*Mask=*/0x3FFFF, /*Shift=*/12),
1372 /*Size=*/4);
1373 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1374 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1375 /*Mask=*/0x7FFF, /*Shift=*/12),
1376 /*Size=*/4);
1377 } else {
1378 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1379 /*Mask=*/0x1FFF, /*Shift=*/12),
1380 /*Size=*/4);
1381 }
1382 }
1383
// From GFX11 on, the LDS block count is halved (rounding up) before it is
// encoded with S_00B02C_EXTRA_LDS_SIZE; earlier generations use it unchanged.
1386 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1387 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1388 : CurrentProgramInfo.LDSBlocks;
1389 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1391 OutStreamer->emitInt32(MFI->getPSInputEnable());
1393 OutStreamer->emitInt32(MFI->getPSInputAddr());
1394 }
1395
// Spill statistics are always emitted, whatever the calling convention.
1396 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1397 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1398 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1399 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1400 }
1401
1402// Helper function to add common PAL Metadata 3.0+
1404 const SIProgramInfo &CurrentProgramInfo,
1405 CallingConv::ID CC, const GCNSubtarget &ST) {
1406 if (ST.hasIEEEMode())
1407 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1408
1409 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1410 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1411
1412 if (AMDGPU::isCompute(CC)) {
1413 MD->setHwStage(CC, ".trap_present",
1414 (bool)CurrentProgramInfo.TrapHandlerEnable);
1415 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1416 }
1417
1418 MD->setHwStage(CC, ".lds_size",
1419 (unsigned)(CurrentProgramInfo.LdsSize *
1420 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1421}
1422
1423// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1424// is AMDPAL. It stores each compute/SPI register setting and other PAL
1425// metadata items into the PALMD::Metadata, combining with any provided by the
1426// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1427// is then written as a single block in the .note section.
// NOTE(review): this copy is missing listing lines 1430 (presumably the
// declaration of MFI), 1470 (the condition that opens the block ending at
// listing line 1508) and 1477 (the value argument of the second setRsrc2
// call). Restore them from upstream before building.
1428 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1429 const SIProgramInfo &CurrentProgramInfo) {
1431 auto CC = MF.getFunction().getCallingConv();
1432 auto *MD = getTargetStreamer()->getPALMetadata();
1433 auto &Ctx = MF.getContext();
1434
1435 MD->setEntryPoint(CC, MF.getFunction().getName());
1436 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1437
1438 // Only set AGPRs for supported devices
1439 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1440 if (STM.hasMAIInsts()) {
1441 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1442 }
1443
1444 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL metadata below major version 3 receives packed RSRC1/RSRC2 values;
// version 3 and above receives individually named hardware-stage fields.
1445 if (MD->getPALMajorVersion() < 3) {
1446 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1447 if (AMDGPU::isCompute(CC)) {
1448 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1449 } else {
// Non-compute stages only contribute the SCRATCH_EN bit here, which is set
// when ScratchBlocks is greater than zero.
1450 const MCExpr *HasScratchBlocks =
1451 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1452 MCConstantExpr::create(0, Ctx), Ctx);
1453 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1454 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1455 }
1456 } else {
1457 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1458 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1459 CurrentProgramInfo.ScratchEnable);
1460 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1461 }
1462
1463 // ScratchSize is in bytes, 16 aligned.
1464 MD->setScratchSize(
1465 CC,
1466 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1467 MCConstantExpr::create(16, Ctx), Ctx),
1468 Ctx);
1469
// From GFX11 on, the LDS block count is halved (rounding up); earlier
// generations use it unchanged.
1471 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1472 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1473 : CurrentProgramInfo.LDSBlocks;
1474 if (MD->getPALMajorVersion() < 3) {
1475 MD->setRsrc2(
1476 CC,
1478 Ctx);
1479 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1480 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1481 } else {
1482 // Graphics registers
// ExtraLDSSize is scaled by a granularity of 256 dwords on GFX11+ (128
// before) and by the size of a dword.
1483 const unsigned ExtraLdsDwGranularity =
1484 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1485 MD->setGraphicsRegisters(
1486 ".ps_extra_lds_size",
1487 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1488
1489 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// Entry i of PsInputFields is set from bit i of each of the two masks.
1490 static StringLiteral const PsInputFields[] = {
1491 ".persp_sample_ena", ".persp_center_ena",
1492 ".persp_centroid_ena", ".persp_pull_model_ena",
1493 ".linear_sample_ena", ".linear_center_ena",
1494 ".linear_centroid_ena", ".line_stipple_tex_ena",
1495 ".pos_x_float_ena", ".pos_y_float_ena",
1496 ".pos_z_float_ena", ".pos_w_float_ena",
1497 ".front_face_ena", ".ancillary_ena",
1498 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1499 unsigned PSInputEna = MFI->getPSInputEnable();
1500 unsigned PSInputAddr = MFI->getPSInputAddr();
1501 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1502 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1503 (bool)((PSInputEna >> Idx) & 1));
1504 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1505 (bool)((PSInputAddr >> Idx) & 1));
1506 }
1507 }
1508 }
1509
1510 // For version 3 and above the wave front size is already set in the metadata
1511 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1512 MD->setWave32(MF.getFunction().getCallingConv());
1513 }
1514
// Records per-function PAL metadata for MF, keyed by the function's name:
// the frame's stack size as the function scratch size, then either packed
// RSRC1/RSRC2 values (PAL major version < 3) or the common hardware-stage
// fields (version 3+), both under CallingConv::AMDGPU_CS, and finally the LDS
// size and the used VGPR/SGPR counts.
// CurrentProgramInfo is not declared in this block; it is presumably a member
// of AMDGPUAsmPrinter — confirm against the class definition.
// NOTE(review): this copy is missing listing lines 1520 (presumably the
// declaration of ST) and 1526 (the first argument of setRsrc1). Restore them
// from upstream before building.
1515 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1516 auto *MD = getTargetStreamer()->getPALMetadata();
1517 const MachineFrameInfo &MFI = MF.getFrameInfo();
1518 StringRef FnName = MF.getFunction().getName();
1519 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1521 MCContext &Ctx = MF.getContext();
1522
1523 if (MD->getPALMajorVersion() < 3) {
1524 // Set compute registers
1525 MD->setRsrc1(
1527 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1528 MD->setRsrc2(CallingConv::AMDGPU_CS,
1529 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1530 } else {
1531 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1532 }
1533
1534 // Set optional info
1535 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1536 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1537 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1538 }
1539
1540// This is supposed to be log2(Size)
1542 switch (Size) {
1543 case 4:
1544 return AMD_ELEMENT_4_BYTES;
1545 case 8:
1546 return AMD_ELEMENT_8_BYTES;
1547 case 16:
1548 return AMD_ELEMENT_16_BYTES;
1549 default:
1550 llvm_unreachable("invalid private_element_size");
1551 }
1552}
1553
// Fills Out for the kernel function in MF. Out is first reset with
// initDefault and then populated from CurrentProgramInfo, the subtarget and
// the function's user-SGPR information. The calling convention must be
// AMDGPU_KERNEL or SPIR_KERNEL (asserted).
// NOTE(review): this copy is missing listing lines 1561, 1567, 1569, 1571,
// 1575, 1580, 1584, 1587, 1590, 1593, 1596, 1599 and 1602 — among them the
// declaration of MFI, the left-hand sides of the two RSRC assignments, and
// the statement controlled by every `if` on UserSGPRInfo / isXNACKEnabled.
// As shown, those `if`s would bind to whatever statement follows them;
// restore the dropped lines from upstream before building.
1554 void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1555 const SIProgramInfo &CurrentProgramInfo,
1556 const MachineFunction &MF) const {
1557 const Function &F = MF.getFunction();
1558 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1559 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1560
1562 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1563 MCContext &Ctx = MF.getContext();
1564
1565 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1566
1568 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1570 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1572
1573 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1574
1576 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1577
1578 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1579 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1581 }
1582
1583 if (UserSGPRInfo.hasDispatchPtr())
1585
1586 if (UserSGPRInfo.hasQueuePtr())
1588
1589 if (UserSGPRInfo.hasKernargSegmentPtr())
1591
1592 if (UserSGPRInfo.hasDispatchID())
1594
1595 if (UserSGPRInfo.hasFlatScratchInit())
1597
1598 if (UserSGPRInfo.hasPrivateSegmentSize())
1600
1601 if (STM.isXNACKEnabled())
1603
// MaxKernArgAlign is filled in by getKernArgSegmentSize as an out-parameter
// and is consumed by the alignment computation at the end.
1604 Align MaxKernArgAlign;
1605 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1606 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1607 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1608 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1609 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1610
1611 // kernarg_segment_alignment is specified as log of the alignment.
1612 // The minimum alignment is 16.
1613 // FIXME: The metadata treats the minimum as 4?
1614 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1615 }
1616
// Prints operand OpNo of the inline-asm instruction MI to O.
// Returns false when the operand was printed (by the generic AsmPrinter code
// or by the register/immediate handling below) and true when it was not:
// a modifier longer than one character, any single-character modifier other
// than 'r', or an operand that is neither a register nor an immediate.
// NOTE(review): this copy is missing listing lines 1617 (the head of the
// signature), 1638-1639 (the register-printing statements) and 1644 (the
// condition guarding the plain decimal `O << Val`). Restore them from
// upstream before building.
1618 const char *ExtraCode, raw_ostream &O) {
1619 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1620 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1621 return false;
1622
1623 if (ExtraCode && ExtraCode[0]) {
1624 if (ExtraCode[1] != 0)
1625 return true; // Unknown modifier.
1626
1627 switch (ExtraCode[0]) {
1628 case 'r':
1629 break;
1630 default:
1631 return true;
1632 }
1633 }
1634
1635 // TODO: Should be able to support other operand types like globals.
1636 const MachineOperand &MO = MI->getOperand(OpNo);
1637 if (MO.isReg()) {
1640 return false;
1641 }
1642 if (MO.isImm()) {
1643 int64_t Val = MO.getImm();
// Immediates not printed in decimal are printed as hex at the narrowest of
// 16, 32 or 64 bits that holds the value as an unsigned number.
1645 O << Val;
1646 } else if (isUInt<16>(Val)) {
1647 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1648 } else if (isUInt<32>(Val)) {
1649 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1650 } else {
1651 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1652 }
1653 return false;
1654 }
1655 return true;
1656 }
1657
1664}
1665
// Emits one "kernel-resource-usage" analysis remark per resource for MF:
// function name, SGPR and VGPR counts, AGPR count (only when hasMAIInsts),
// scratch size, dynamic-stack flag, occupancy, SGPR/VGPR spill counts and,
// only when isModuleEntryFunction, the LDS size. Does nothing when ORE is
// null.
// NOTE(review): this copy is missing listing lines 1676-1677 and 1681 (the
// conditions of the two early returns that follow the ORE check) and 1695
// (an argument of the MachineOptimizationRemarkAnalysis constructor).
// Restore them from upstream before building.
1666 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1667 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1668 bool isModuleEntryFunction, bool hasMAIInsts) {
1669 if (!ORE)
1670 return;
1671
1672 const char *Name = "kernel-resource-usage";
1673 const char *Indent = " ";
1674
1675 // If the remark is not specifically enabled, do not output to yaml
1678 return;
1679
1680 // Currently non-kernel functions have no resources to emit.
1682 return;
1683
1684 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1685 StringRef RemarkLabel, auto Argument) {
1686 // Add an indent for every line besides the line with the kernel name. This
1687 // makes it easier to tell which resource usage go with which kernel since
1688 // the kernel name will always be displayed first.
1689 std::string LabelStr = RemarkLabel.str() + ": ";
1690 if (RemarkName != "FunctionName")
1691 LabelStr = Indent + LabelStr;
1692
1693 ORE->emit([&]() {
1694 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1696 &MF.front())
1697 << LabelStr << ore::NV(RemarkName, Argument);
1698 });
1699 };
1700
1701 // FIXME: Formatting here is pretty nasty because clang does not accept
1702 // newlines from diagnostics. This forces us to emit multiple diagnostic
1703 // remarks to simulate newlines. If and when clang does accept newlines, this
1704 // formatting should be aggregated into one remark with newlines to avoid
1705 // printing multiple diagnostic location and diag opts.
1706 EmitResourceUsageRemark("FunctionName", "Function Name",
1707 MF.getFunction().getName());
1708 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1709 getMCExprStr(CurrentProgramInfo.NumSGPR));
1710 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1711 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1712 if (hasMAIInsts) {
1713 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1714 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1715 }
1716 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1717 getMCExprStr(CurrentProgramInfo.ScratchSize));
// The dynamic-stack flag is reported as "True" only when the expression
// evaluates to a non-zero absolute value; an expression that cannot be
// evaluated is reported as "False".
1718 int64_t DynStack;
1719 bool DynStackEvaluatable =
1720 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1721 StringRef DynamicStackStr =
1722 DynStackEvaluatable && DynStack ? "True" : "False";
1723 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1724 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1725 getMCExprStr(CurrentProgramInfo.Occupancy));
1726 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1727 CurrentProgramInfo.SGPRSpill);
1728 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1729 CurrentProgramInfo.VGPRSpill);
1730 if (isModuleEntryFunction)
1731 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1732 CurrentProgramInfo.LDSSize);
1733 }
#define Success
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:128
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*----===//
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1089
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1227
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1209
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1125
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1201
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1160
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1222
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1112
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1111
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1120
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1159
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1098
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1220
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1162
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1241
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1208
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1219
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1103
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1242
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1097
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1122
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1096
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:69
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:83
void setHwStage(unsigned CC, StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:87
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:408
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:697
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:719
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:90
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:93
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:105
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:459
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:652
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:450
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:404
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:117
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition: AsmPrinter.h:124
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition: AsmPrinter.h:108
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:97
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:102
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:260
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:692
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1874
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:623
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:627
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:317
bool dumpCode() const
Definition: GCNSubtarget.h:523
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:331
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasPrivateSegmentSize() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:249
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
bool hasLocalLinkage() const
Definition: GlobalValue.h:529
unsigned getAddressSpace() const
Definition: GlobalValue.h:206
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:130
Type * getValueType() const
Definition: GlobalValue.h:297
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:186
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:602
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:572
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:592
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:557
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:547
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
Context object for machine code objects.
Definition: MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition: MCContext.h:416
MCSectionELF * getELFSection(const Twine &Section, unsigned Type, unsigned Flags)
Definition: MCContext.h:551
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1072
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:34
MCSection * getReadOnlySection() const
MCContext & getContext() const
void gatherResourceInfo(const MachineFunction &MF, const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI, MCContext &OutContext)
AMDGPUResourceUsageAnalysis gathers resource usage on a per-function granularity.
MCSymbol * getMaxSGPRSymbol(MCContext &OutContext)
MCSymbol * getMaxAGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx)
void finalize(MCContext &OutContext)
MCSymbol * getSymbol(StringRef FuncName, ResourceInfoKind RIK, MCContext &OutContext, bool IsLocal)
MCSymbol * getMaxVGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack, MCContext &Ctx)
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:27
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition: MCSection.h:150
MCContext & getContext() const
Definition: MCStreamer.h:300
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
const MCExpr * getVariableValue(bool SetUsed=true) const
getVariableValue - Get the value for variable symbols.
Definition: MCSymbol.h:305
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:102
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:467
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:71
This class contains meta information specific to a module.
MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:229
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
bool isModuleEntryFunctionCC(CallingConv::ID CC)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1098
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1377
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
const SIFunctionResourceInfo & getResourceInfo() const
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.