1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
59void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
60 CodeGenCoverage *CoverageInfo,
61 ProfileSummaryInfo *PSI,
62 BlockFrequencyInfo *BFI) {
63 MRI = &MF.getRegInfo();
64 Subtarget = &MF.getSubtarget<GCNSubtarget>();
65 Subtarget->checkSubtargetFeatures(MF.getFunction());
66 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67}
68
69// Return the wave level SGPR base address if this is a wave address.
70static Register getWaveAddress(const MachineInstr *Def) {
71 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
97
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
121
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164 assert(Subtarget->useRealTrue16Insts());
165 const int64_t NoMods = 0;
166 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167 .addImm(NoMods)
168 .addImm(1)
169 .addImm(NoMods)
170 .addReg(SrcReg)
171 .addImm(NoMods);
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173 .addImm(NoMods)
174 .addImm(0)
175 .addImm(NoMods)
176 .addReg(MaskedReg)
177 .addImm(NoMods);
178 } else {
179 bool IsSGPR = TRI.isSGPRClass(SrcRC);
180 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
182 .addImm(1)
183 .addReg(SrcReg);
184 if (IsSGPR)
185 And.setOperandDead(3); // Dead scc
186
187 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188 .addImm(0)
189 .addReg(MaskedReg);
190 }
191 }
192
193 if (!MRI->getRegClassOrNull(SrcReg))
194 MRI->setRegClass(SrcReg, SrcRC);
195 I.eraseFromParent();
196 return true;
197 }
198
199 const TargetRegisterClass *RC =
200 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
201 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202 return false;
203
204 return true;
205 }
206
207 for (const MachineOperand &MO : I.operands()) {
208 if (MO.getReg().isPhysical())
209 continue;
210
211 const TargetRegisterClass *RC =
212 TRI.getConstrainedRegClassForOperand(MO, *MRI);
213 if (!RC)
214 continue;
215 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216 }
217 return true;
218}
219
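// Lower a copy from a VCC-bank lane mask to an SCC-bank boolean: compare the
// mask against zero with S_CMP_LG and copy SCC into the 32-bit destination.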
220bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
221 const DebugLoc &DL = I.getDebugLoc();
222 MachineBasicBlock *BB = I.getParent();
223
224 unsigned CmpOpc =
225 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
226 MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
227 .addReg(I.getOperand(1).getReg())
228 .addImm(0);
229 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
230 return false;
231
232 Register DstReg = I.getOperand(0).getReg();
233 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
234
235 I.eraseFromParent();
236 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
237}
238
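// Lower a copy from an SCC-bank boolean to a VCC-bank lane mask. Constant
// inputs fold to 0 or EXEC; otherwise the value is copied into SCC and
// S_CSELECT picks between EXEC and 0.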
239bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
240 const DebugLoc &DL = I.getDebugLoc();
241 MachineBasicBlock *BB = I.getParent();
242
243 Register DstReg = I.getOperand(0).getReg();
244 Register SrcReg = I.getOperand(1).getReg();
245 std::optional<ValueAndVReg> Arg =
246 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
247
248 if (Arg) {
249 const int64_t Value = Arg->Value.getZExtValue();
250 if (Value == 0) {
251 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
252 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
253 } else {
254 assert(Value == 1);
255 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
256 }
257 I.eraseFromParent();
258 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
259 }
260
261 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
262 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
263
264 unsigned SelectOpcode =
265 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
266 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
267 .addReg(TRI.getExec())
268 .addImm(0);
269
270 I.eraseFromParent();
271 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
272}
273
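// Select a read of a uniform value out of a VGPR via V_READFIRSTLANE_B32.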
274bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
275 Register DstReg = I.getOperand(0).getReg();
276 Register SrcReg = I.getOperand(1).getReg();
277
278 const DebugLoc &DL = I.getDebugLoc();
279 MachineBasicBlock *BB = I.getParent();
280
281 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
282 .addReg(SrcReg);
283
284 I.eraseFromParent();
285 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
286}
287
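// Select G_PHI into PHI, constraining the result and any inputs that already
// have a register bank to a matching register class.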
288bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
289 const Register DefReg = I.getOperand(0).getReg();
290 const LLT DefTy = MRI->getType(DefReg);
291
292 // S1 G_PHIs should not be selected in instruction-select, instead:
293 // - divergent S1 G_PHI should go through lane mask merging algorithm
294 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
295 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
296 if (DefTy == LLT::scalar(1))
297 return false;
298
299 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
300
301 const RegClassOrRegBank &RegClassOrBank =
302 MRI->getRegClassOrRegBank(DefReg);
303
304 const TargetRegisterClass *DefRC =
305 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
306 if (!DefRC) {
307 if (!DefTy.isValid()) {
308 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
309 return false;
310 }
311
312 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
313 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
314 if (!DefRC) {
315 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
316 return false;
317 }
318 }
319
320 // If inputs have register bank, assign corresponding reg class.
321 // Note: registers don't need to have the same reg bank.
322 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
323 const Register SrcReg = I.getOperand(i).getReg();
324
325 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326 if (RB) {
327 const LLT SrcTy = MRI->getType(SrcReg);
328 const TargetRegisterClass *SrcRC =
329 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331 return false;
332 }
333 }
334
335 I.setDesc(TII.get(TargetOpcode::PHI));
336 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
337}
338
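// Return a 32-bit operand for the sub0/sub1 half of a 64-bit operand MO:
// register operands are copied into a fresh register of SubRC through the
// composed subregister index, immediates are split into their low/high 32 bits.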
339MachineOperand
340AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
341 const TargetRegisterClass &SubRC,
342 unsigned SubIdx) const {
343
344 MachineInstr *MI = MO.getParent();
345 MachineBasicBlock *BB = MO.getParent()->getParent();
346 Register DstReg = MRI->createVirtualRegister(&SubRC);
347
348 if (MO.isReg()) {
349 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
350 Register Reg = MO.getReg();
351 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
352 .addReg(Reg, 0, ComposedSubIdx);
353
354 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
355 MO.isKill(), MO.isDead(), MO.isUndef(),
356 MO.isEarlyClobber(), 0, MO.isDebug(),
357 MO.isInternalRead());
358 }
359
360 assert(MO.isImm());
361
362 APInt Imm(64, MO.getImm());
363
364 switch (SubIdx) {
365 default:
366 llvm_unreachable("do not know to split immediate with this sub index.");
367 case AMDGPU::sub0:
368 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
369 case AMDGPU::sub1:
370 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
371 }
372}
373
374static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
375 switch (Opc) {
376 case AMDGPU::G_AND:
377 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
378 case AMDGPU::G_OR:
379 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
380 case AMDGPU::G_XOR:
381 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
382 default:
383 llvm_unreachable("not a bit op");
384 }
385}
386
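// Select scalar G_AND/G_OR/G_XOR to S_AND/S_OR/S_XOR (32- or 64-bit), adding
// the implicit dead SCC def; only the SGPR and VCC banks are handled here.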
387bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
388 Register DstReg = I.getOperand(0).getReg();
389 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
390
391 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
392 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
393 DstRB->getID() != AMDGPU::VCCRegBankID)
394 return false;
395
396 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
397 STI.isWave64());
398 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
399
400 // Dead implicit-def of scc
401 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
402 true, // isImp
403 false, // isKill
404 true)); // isDead
405 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
406}
407
408bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
409 MachineBasicBlock *BB = I.getParent();
410 MachineFunction *MF = BB->getParent();
411 Register DstReg = I.getOperand(0).getReg();
412 const DebugLoc &DL = I.getDebugLoc();
413 LLT Ty = MRI->getType(DstReg);
414 if (Ty.isVector())
415 return false;
416
417 unsigned Size = Ty.getSizeInBits();
418 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
419 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
420 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
421
422 if (Size == 32) {
423 if (IsSALU) {
424 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
425 MachineInstr *Add =
426 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
427 .add(I.getOperand(1))
428 .add(I.getOperand(2))
429 .setOperandDead(3); // Dead scc
430 I.eraseFromParent();
431 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
432 }
433
434 if (STI.hasAddNoCarry()) {
435 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
436 I.setDesc(TII.get(Opc));
437 I.addOperand(*MF, MachineOperand::CreateImm(0));
438 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
439 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
440 }
441
442 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
443
444 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
445 MachineInstr *Add
446 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
447 .addDef(UnusedCarry, RegState::Dead)
448 .add(I.getOperand(1))
449 .add(I.getOperand(2))
450 .addImm(0);
451 I.eraseFromParent();
452 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
453 }
454
455 assert(!Sub && "illegal sub should not reach here");
456
457 const TargetRegisterClass &RC
458 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
459 const TargetRegisterClass &HalfRC
460 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
461
462 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
463 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
464 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
465 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
466
467 Register DstLo = MRI->createVirtualRegister(&HalfRC);
468 Register DstHi = MRI->createVirtualRegister(&HalfRC);
469
470 if (IsSALU) {
471 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
472 .add(Lo1)
473 .add(Lo2);
474 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
475 .add(Hi1)
476 .add(Hi2)
477 .setOperandDead(3); // Dead scc
478 } else {
479 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
480 Register CarryReg = MRI->createVirtualRegister(CarryRC);
481 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
482 .addDef(CarryReg)
483 .add(Lo1)
484 .add(Lo2)
485 .addImm(0);
486 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
487 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
488 .add(Hi1)
489 .add(Hi2)
490 .addReg(CarryReg, RegState::Kill)
491 .addImm(0);
492
493 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
494 return false;
495 }
496
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
498 .addReg(DstLo)
499 .addImm(AMDGPU::sub0)
500 .addReg(DstHi)
501 .addImm(AMDGPU::sub1);
502
503
504 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
505 return false;
506
507 I.eraseFromParent();
508 return true;
509}
510
511bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
512 MachineInstr &I) const {
513 MachineBasicBlock *BB = I.getParent();
514 MachineFunction *MF = BB->getParent();
515 const DebugLoc &DL = I.getDebugLoc();
516 Register Dst0Reg = I.getOperand(0).getReg();
517 Register Dst1Reg = I.getOperand(1).getReg();
518 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
519 I.getOpcode() == AMDGPU::G_UADDE;
520 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
521 I.getOpcode() == AMDGPU::G_USUBE;
522
523 if (isVCC(Dst1Reg, *MRI)) {
524 unsigned NoCarryOpc =
525 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
526 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
527 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
528 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
529 I.addOperand(*MF, MachineOperand::CreateImm(0));
530 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
531 }
532
533 Register Src0Reg = I.getOperand(2).getReg();
534 Register Src1Reg = I.getOperand(3).getReg();
535
536 if (HasCarryIn) {
537 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
538 .addReg(I.getOperand(4).getReg());
539 }
540
541 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
542 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
543
544 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
545 .add(I.getOperand(2))
546 .add(I.getOperand(3));
547
548 if (MRI->use_nodbg_empty(Dst1Reg)) {
549 CarryInst.setOperandDead(3); // Dead scc
550 } else {
551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
552 .addReg(AMDGPU::SCC);
553 if (!MRI->getRegClassOrNull(Dst1Reg))
554 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
555 }
556
557 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
558 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
559 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
560 return false;
561
562 if (HasCarryIn &&
563 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
564 AMDGPU::SReg_32RegClass, *MRI))
565 return false;
566
567 I.eraseFromParent();
568 return true;
569}
570
571bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
572 MachineInstr &I) const {
573 MachineBasicBlock *BB = I.getParent();
574 MachineFunction *MF = BB->getParent();
575 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
576
577 unsigned Opc;
578 if (Subtarget->hasMADIntraFwdBug())
579 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
580 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
581 else
582 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
583 I.setDesc(TII.get(Opc));
584 I.addOperand(*MF, MachineOperand::CreateImm(0));
585 I.addImplicitDefUseOperands(*MF);
586 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
587}
588
589// TODO: We should probably legalize these to only using 32-bit results.
590bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
591 MachineBasicBlock *BB = I.getParent();
592 Register DstReg = I.getOperand(0).getReg();
593 Register SrcReg = I.getOperand(1).getReg();
594 LLT DstTy = MRI->getType(DstReg);
595 LLT SrcTy = MRI->getType(SrcReg);
596 const unsigned SrcSize = SrcTy.getSizeInBits();
597 unsigned DstSize = DstTy.getSizeInBits();
598
599 // TODO: Should handle any multiple of 32 offset.
600 unsigned Offset = I.getOperand(2).getImm();
601 if (Offset % 32 != 0 || DstSize > 128)
602 return false;
603
604 // 16-bit operations really use 32-bit registers.
605 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
606 if (DstSize == 16)
607 DstSize = 32;
608
609 const TargetRegisterClass *DstRC =
610 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
611 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
612 return false;
613
614 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
615 const TargetRegisterClass *SrcRC =
616 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
617 if (!SrcRC)
618 return false;
619 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
620 DstSize / 32);
621 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
622 if (!SrcRC)
623 return false;
624
625 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
626 *SrcRC, I.getOperand(1));
627 const DebugLoc &DL = I.getDebugLoc();
628 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
629 .addReg(SrcReg, 0, SubReg);
630
631 I.eraseFromParent();
632 return true;
633}
634
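// Select G_MERGE_VALUES (and wide G_BUILD_VECTORs routed here) into a
// REG_SEQUENCE; sub-32-bit sources fall back to the imported patterns.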
635bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
636 MachineBasicBlock *BB = MI.getParent();
637 Register DstReg = MI.getOperand(0).getReg();
638 LLT DstTy = MRI->getType(DstReg);
639 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
640
641 const unsigned SrcSize = SrcTy.getSizeInBits();
642 if (SrcSize < 32)
643 return selectImpl(MI, *CoverageInfo);
644
645 const DebugLoc &DL = MI.getDebugLoc();
646 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
647 const unsigned DstSize = DstTy.getSizeInBits();
648 const TargetRegisterClass *DstRC =
649 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
650 if (!DstRC)
651 return false;
652
653 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
654 MachineInstrBuilder MIB =
655 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
656 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
657 MachineOperand &Src = MI.getOperand(I + 1);
658 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
659 MIB.addImm(SubRegs[I]);
660
661 const TargetRegisterClass *SrcRC
662 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
663 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
664 return false;
665 }
666
667 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
668 return false;
669
670 MI.eraseFromParent();
671 return true;
672}
673
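// Select G_UNMERGE_VALUES by copying each destination out of the matching
// subregister of the source register.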
674bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
675 MachineBasicBlock *BB = MI.getParent();
676 const int NumDst = MI.getNumOperands() - 1;
677
678 MachineOperand &Src = MI.getOperand(NumDst);
679
680 Register SrcReg = Src.getReg();
681 Register DstReg0 = MI.getOperand(0).getReg();
682 LLT DstTy = MRI->getType(DstReg0);
683 LLT SrcTy = MRI->getType(SrcReg);
684
685 const unsigned DstSize = DstTy.getSizeInBits();
686 const unsigned SrcSize = SrcTy.getSizeInBits();
687 const DebugLoc &DL = MI.getDebugLoc();
688 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
689
690 const TargetRegisterClass *SrcRC =
691 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
692 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
693 return false;
694
695 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
696 // source, and this relies on the fact that the same subregister indices are
697 // used for both.
698 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
699 for (int I = 0, E = NumDst; I != E; ++I) {
700 MachineOperand &Dst = MI.getOperand(I);
701 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
702 .addReg(SrcReg, 0, SubRegs[I]);
703
704 // Make sure the subregister index is valid for the source register.
705 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
706 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
707 return false;
708
709 const TargetRegisterClass *DstRC =
710 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
711 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
712 return false;
713 }
714
715 MI.eraseFromParent();
716 return true;
717}
718
719bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
720 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
721 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
722
723 Register Src0 = MI.getOperand(1).getReg();
724 Register Src1 = MI.getOperand(2).getReg();
725 LLT SrcTy = MRI->getType(Src0);
726 const unsigned SrcSize = SrcTy.getSizeInBits();
727
728 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
729 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
730 return selectG_MERGE_VALUES(MI);
731 }
732
733 // Selection logic below is for V2S16 only.
734 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
735 Register Dst = MI.getOperand(0).getReg();
736 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
737 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
738 SrcTy != LLT::scalar(32)))
739 return selectImpl(MI, *CoverageInfo);
740
741 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
742 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
743 return false;
744
745 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
746 DstBank->getID() == AMDGPU::VGPRRegBankID);
747 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
748
749 const DebugLoc &DL = MI.getDebugLoc();
750 MachineBasicBlock *BB = MI.getParent();
751
752 // First, before trying TableGen patterns, check if both sources are
753 // constants. In those cases, we can trivially compute the final constant
754 // and emit a simple move.
755 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
756 if (ConstSrc1) {
757 auto ConstSrc0 =
758 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
759 if (ConstSrc0) {
760 const int64_t K0 = ConstSrc0->Value.getSExtValue();
761 const int64_t K1 = ConstSrc1->Value.getSExtValue();
762 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
763 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
764 uint32_t Imm = Lo16 | (Hi16 << 16);
765
766 // VALU
767 if (IsVector) {
768 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
769 MI.eraseFromParent();
770 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
771 }
772
773 // SALU
774 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
775 MI.eraseFromParent();
776 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
777 }
778 }
779
780 // Now try TableGen patterns.
781 if (selectImpl(MI, *CoverageInfo))
782 return true;
783
784 // TODO: This should probably be a combine somewhere
785 // (build_vector $src0, undef) -> copy $src0
786 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
787 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
788 MI.setDesc(TII.get(AMDGPU::COPY));
789 MI.removeOperand(2);
790 const auto &RC =
791 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
792 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
793 RBI.constrainGenericRegister(Src0, RC, *MRI);
794 }
795
796 // TODO: Can be improved?
797 if (IsVector) {
798 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
799 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
800 .addImm(0xFFFF)
801 .addReg(Src0);
802 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
803 return false;
804
805 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
806 .addReg(Src1)
807 .addImm(16)
808 .addReg(TmpReg);
809 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
810 return false;
811
812 MI.eraseFromParent();
813 return true;
814 }
815
816 Register ShiftSrc0;
817 Register ShiftSrc1;
818
819 // With multiple uses of the shift, this will duplicate the shift and
820 // increase register pressure.
821 //
822 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
823 // => (S_PACK_HH_B32_B16 $src0, $src1)
824 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
825 // => (S_PACK_HL_B32_B16 $src0, $src1)
826 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
827 // => (S_PACK_LH_B32_B16 $src0, $src1)
828 // (build_vector $src0, $src1)
829 // => (S_PACK_LL_B32_B16 $src0, $src1)
830
831 bool Shift0 = mi_match(
832 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
833
834 bool Shift1 = mi_match(
835 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
836
837 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
838 if (Shift0 && Shift1) {
839 Opc = AMDGPU::S_PACK_HH_B32_B16;
840 MI.getOperand(1).setReg(ShiftSrc0);
841 MI.getOperand(2).setReg(ShiftSrc1);
842 } else if (Shift1) {
843 Opc = AMDGPU::S_PACK_LH_B32_B16;
844 MI.getOperand(2).setReg(ShiftSrc1);
845 } else if (Shift0) {
846 auto ConstSrc1 =
847 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
848 if (ConstSrc1 && ConstSrc1->Value == 0) {
849 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
850 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
851 .addReg(ShiftSrc0)
852 .addImm(16)
853 .setOperandDead(3); // Dead scc
854
855 MI.eraseFromParent();
856 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
857 }
858 if (STI.hasSPackHL()) {
859 Opc = AMDGPU::S_PACK_HL_B32_B16;
860 MI.getOperand(1).setReg(ShiftSrc0);
861 }
862 }
863
864 MI.setDesc(TII.get(Opc));
865 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
866}
867
868bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
869 const MachineOperand &MO = I.getOperand(0);
870
871 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
872 // regbank check here is to know why getConstrainedRegClassForOperand failed.
873 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
874 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
875 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
876 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
877 return true;
878 }
879
880 return false;
881}
882
883bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
884 MachineBasicBlock *BB = I.getParent();
885
886 Register DstReg = I.getOperand(0).getReg();
887 Register Src0Reg = I.getOperand(1).getReg();
888 Register Src1Reg = I.getOperand(2).getReg();
889 LLT Src1Ty = MRI->getType(Src1Reg);
890
891 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
892 unsigned InsSize = Src1Ty.getSizeInBits();
893
894 int64_t Offset = I.getOperand(3).getImm();
895
896 // FIXME: These cases should have been illegal and unnecessary to check here.
897 if (Offset % 32 != 0 || InsSize % 32 != 0)
898 return false;
899
900 // Currently not handled by getSubRegFromChannel.
901 if (InsSize > 128)
902 return false;
903
904 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
905 if (SubReg == AMDGPU::NoSubRegister)
906 return false;
907
908 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
909 const TargetRegisterClass *DstRC =
910 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
911 if (!DstRC)
912 return false;
913
914 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
915 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
916 const TargetRegisterClass *Src0RC =
917 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
918 const TargetRegisterClass *Src1RC =
919 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
920
921 // Deal with weird cases where the class only partially supports the subreg
922 // index.
923 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
924 if (!Src0RC || !Src1RC)
925 return false;
926
927 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
928 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
929 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
930 return false;
931
932 const DebugLoc &DL = I.getDebugLoc();
933 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
934 .addReg(Src0Reg)
935 .addReg(Src1Reg)
936 .addImm(SubReg);
937
938 I.eraseFromParent();
939 return true;
940}
941
942bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
943 Register DstReg = MI.getOperand(0).getReg();
944 Register SrcReg = MI.getOperand(1).getReg();
945 Register OffsetReg = MI.getOperand(2).getReg();
946 Register WidthReg = MI.getOperand(3).getReg();
947
948 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
949 "scalar BFX instructions are expanded in regbankselect");
950 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
951 "64-bit vector BFX instructions are expanded in regbankselect");
952
953 const DebugLoc &DL = MI.getDebugLoc();
954 MachineBasicBlock *MBB = MI.getParent();
955
956 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
957 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
958 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
959 .addReg(SrcReg)
960 .addReg(OffsetReg)
961 .addReg(WidthReg);
962 MI.eraseFromParent();
963 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
964}
965
966bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
967 if (STI.getLDSBankCount() != 16)
968 return selectImpl(MI, *CoverageInfo);
969
970 Register Dst = MI.getOperand(0).getReg();
971 Register Src0 = MI.getOperand(2).getReg();
972 Register M0Val = MI.getOperand(6).getReg();
973 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
974 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
975 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
976 return false;
977
978 // This requires 2 instructions. It is possible to write a pattern to support
979 // this, but the generated isel emitter doesn't correctly deal with multiple
980 // output instructions using the same physical register input. The copy to m0
981 // is incorrectly placed before the second instruction.
982 //
983 // TODO: Match source modifiers.
984
985 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
986 const DebugLoc &DL = MI.getDebugLoc();
987 MachineBasicBlock *MBB = MI.getParent();
988
989 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
990 .addReg(M0Val);
991 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
992 .addImm(2)
993 .addImm(MI.getOperand(4).getImm()) // $attr
994 .addImm(MI.getOperand(3).getImm()); // $attrchan
995
996 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
997 .addImm(0) // $src0_modifiers
998 .addReg(Src0) // $src0
999 .addImm(MI.getOperand(4).getImm()) // $attr
1000 .addImm(MI.getOperand(3).getImm()) // $attrchan
1001 .addImm(0) // $src2_modifiers
1002 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1003 .addImm(MI.getOperand(5).getImm()) // $high
1004 .addImm(0) // $clamp
1005 .addImm(0); // $omod
1006
1007 MI.eraseFromParent();
1008 return true;
1009}
1010
1011// Writelane is special in that it can use SGPR and M0 (which would normally
1012// count as using the constant bus twice - but in this case it is allowed since
1013// the lane selector doesn't count as a use of the constant bus). However, it is
1014// still required to abide by the 1 SGPR rule. Fix this up if we might have
1015// multiple SGPRs.
1016bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1017 // With a constant bus limit of at least 2, there's no issue.
1018 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1019 return selectImpl(MI, *CoverageInfo);
1020
1021 MachineBasicBlock *MBB = MI.getParent();
1022 const DebugLoc &DL = MI.getDebugLoc();
1023 Register VDst = MI.getOperand(0).getReg();
1024 Register Val = MI.getOperand(2).getReg();
1025 Register LaneSelect = MI.getOperand(3).getReg();
1026 Register VDstIn = MI.getOperand(4).getReg();
1027
1028 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1029
1030 std::optional<ValueAndVReg> ConstSelect =
1031 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1032 if (ConstSelect) {
1033 // The selector has to be an inline immediate, so we can use whatever for
1034 // the other operands.
1035 MIB.addReg(Val);
1036 MIB.addImm(ConstSelect->Value.getSExtValue() &
1037 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1038 } else {
1039 std::optional<ValueAndVReg> ConstVal =
1040 getIConstantVRegValWithLookThrough(Val, *MRI);
1041
1042 // If the value written is an inline immediate, we can get away without a
1043 // copy to m0.
1044 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1045 STI.hasInv2PiInlineImm())) {
1046 MIB.addImm(ConstVal->Value.getSExtValue());
1047 MIB.addReg(LaneSelect);
1048 } else {
1049 MIB.addReg(Val);
1050
1051 // If the lane selector was originally in a VGPR and copied with
1052 // readfirstlane, there's a hazard to read the same SGPR from the
1053 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1054 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1055
1056 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1057 .addReg(LaneSelect);
1058 MIB.addReg(AMDGPU::M0);
1059 }
1060 }
1061
1062 MIB.addReg(VDstIn);
1063
1064 MI.eraseFromParent();
1065 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1066}
1067
1068// We need to handle this here because tablegen doesn't support matching
1069// instructions with multiple outputs.
1070bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1071 Register Dst0 = MI.getOperand(0).getReg();
1072 Register Dst1 = MI.getOperand(1).getReg();
1073
1074 LLT Ty = MRI->getType(Dst0);
1075 unsigned Opc;
1076 if (Ty == LLT::scalar(32))
1077 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1078 else if (Ty == LLT::scalar(64))
1079 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1080 else
1081 return false;
1082
1083 // TODO: Match source modifiers.
1084
1085 const DebugLoc &DL = MI.getDebugLoc();
1086 MachineBasicBlock *MBB = MI.getParent();
1087
1088 Register Numer = MI.getOperand(3).getReg();
1089 Register Denom = MI.getOperand(4).getReg();
1090 unsigned ChooseDenom = MI.getOperand(5).getImm();
1091
1092 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1093
1094 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1095 .addDef(Dst1)
1096 .addImm(0) // $src0_modifiers
1097 .addUse(Src0) // $src0
1098 .addImm(0) // $src1_modifiers
1099 .addUse(Denom) // $src1
1100 .addImm(0) // $src2_modifiers
1101 .addUse(Numer) // $src2
1102 .addImm(0) // $clamp
1103 .addImm(0); // $omod
1104
1105 MI.eraseFromParent();
1106 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1107}
1108
1109bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1110 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1111 switch (IntrinsicID) {
1112 case Intrinsic::amdgcn_if_break: {
1113 MachineBasicBlock *BB = I.getParent();
1114
1115 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1116 // SelectionDAG uses for wave32 vs wave64.
1117 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1118 .add(I.getOperand(0))
1119 .add(I.getOperand(2))
1120 .add(I.getOperand(3));
1121
1122 Register DstReg = I.getOperand(0).getReg();
1123 Register Src0Reg = I.getOperand(2).getReg();
1124 Register Src1Reg = I.getOperand(3).getReg();
1125
1126 I.eraseFromParent();
1127
1128 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1129 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1130
1131 return true;
1132 }
1133 case Intrinsic::amdgcn_interp_p1_f16:
1134 return selectInterpP1F16(I);
1135 case Intrinsic::amdgcn_wqm:
1136 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1137 case Intrinsic::amdgcn_softwqm:
1138 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1139 case Intrinsic::amdgcn_strict_wwm:
1140 case Intrinsic::amdgcn_wwm:
1141 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1142 case Intrinsic::amdgcn_strict_wqm:
1143 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1144 case Intrinsic::amdgcn_writelane:
1145 return selectWritelane(I);
1146 case Intrinsic::amdgcn_div_scale:
1147 return selectDivScale(I);
1148 case Intrinsic::amdgcn_icmp:
1149 case Intrinsic::amdgcn_fcmp:
1150 if (selectImpl(I, *CoverageInfo))
1151 return true;
1152 return selectIntrinsicCmp(I);
1153 case Intrinsic::amdgcn_ballot:
1154 return selectBallot(I);
1155 case Intrinsic::amdgcn_reloc_constant:
1156 return selectRelocConstant(I);
1157 case Intrinsic::amdgcn_groupstaticsize:
1158 return selectGroupStaticSize(I);
1159 case Intrinsic::returnaddress:
1160 return selectReturnAddress(I);
1161 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1162 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1163 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1164 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1165 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1166 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1167 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1168 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1169 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1170 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1171 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1172 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1173 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1174 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1175 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1176 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1177 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1178 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1179 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1180 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1181 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1182 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1183 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1184 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1185 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1186 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1187 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1189 return selectSMFMACIntrin(I);
1190 case Intrinsic::amdgcn_permlane16_swap:
1191 case Intrinsic::amdgcn_permlane32_swap:
1192 return selectPermlaneSwapIntrin(I, IntrinsicID);
1193 default:
1194 return selectImpl(I, *CoverageInfo);
1195 }
1196}
1197
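// Map a compare predicate and operand size to the matching VALU V_CMP opcode,
// picking the 16-, 32- or 64-bit form the subtarget supports.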
1198static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1199 const GCNSubtarget &ST) {
1200 if (Size != 16 && Size != 32 && Size != 64)
1201 return -1;
1202
1203 if (Size == 16 && !ST.has16BitInsts())
1204 return -1;
1205
1206 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1207 unsigned FakeS16Opc, unsigned S32Opc,
1208 unsigned S64Opc) {
1209 if (Size == 16)
1210 // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
1211 return ST.hasTrue16BitInsts()
1212 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
1213 : S16Opc;
1214 if (Size == 32)
1215 return S32Opc;
1216 return S64Opc;
1217 };
1218
1219 switch (P) {
1220 default:
1221 llvm_unreachable("Unknown condition code!");
1222 case CmpInst::ICMP_NE:
1223 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1224 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1225 AMDGPU::V_CMP_NE_U64_e64);
1226 case CmpInst::ICMP_EQ:
1227 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1228 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1229 AMDGPU::V_CMP_EQ_U64_e64);
1230 case CmpInst::ICMP_SGT:
1231 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1232 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1233 AMDGPU::V_CMP_GT_I64_e64);
1234 case CmpInst::ICMP_SGE:
1235 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1236 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1237 AMDGPU::V_CMP_GE_I64_e64);
1238 case CmpInst::ICMP_SLT:
1239 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1240 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1241 AMDGPU::V_CMP_LT_I64_e64);
1242 case CmpInst::ICMP_SLE:
1243 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1244 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1245 AMDGPU::V_CMP_LE_I64_e64);
1246 case CmpInst::ICMP_UGT:
1247 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1248 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1249 AMDGPU::V_CMP_GT_U64_e64);
1250 case CmpInst::ICMP_UGE:
1251 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1252 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1253 AMDGPU::V_CMP_GE_U64_e64);
1254 case CmpInst::ICMP_ULT:
1255 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1256 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1257 AMDGPU::V_CMP_LT_U64_e64);
1258 case CmpInst::ICMP_ULE:
1259 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1260 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1261 AMDGPU::V_CMP_LE_U64_e64);
1262
1263 case CmpInst::FCMP_OEQ:
1264 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1265 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1266 AMDGPU::V_CMP_EQ_F64_e64);
1267 case CmpInst::FCMP_OGT:
1268 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1269 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1270 AMDGPU::V_CMP_GT_F64_e64);
1271 case CmpInst::FCMP_OGE:
1272 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1273 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1274 AMDGPU::V_CMP_GE_F64_e64);
1275 case CmpInst::FCMP_OLT:
1276 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1277 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1278 AMDGPU::V_CMP_LT_F64_e64);
1279 case CmpInst::FCMP_OLE:
1280 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1281 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1282 AMDGPU::V_CMP_LE_F64_e64);
1283 case CmpInst::FCMP_ONE:
1284 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1285 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1286 AMDGPU::V_CMP_NEQ_F64_e64);
1287 case CmpInst::FCMP_ORD:
1288 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1289 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1290 AMDGPU::V_CMP_O_F64_e64);
1291 case CmpInst::FCMP_UNO:
1292 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1293 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1294 AMDGPU::V_CMP_U_F64_e64);
1295 case CmpInst::FCMP_UEQ:
1296 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1297 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1298 AMDGPU::V_CMP_NLG_F64_e64);
1299 case CmpInst::FCMP_UGT:
1300 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1301 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1302 AMDGPU::V_CMP_NLE_F64_e64);
1303 case CmpInst::FCMP_UGE:
1304 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1305 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1306 AMDGPU::V_CMP_NLT_F64_e64);
1307 case CmpInst::FCMP_ULT:
1308 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1309 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1310 AMDGPU::V_CMP_NGE_F64_e64);
1311 case CmpInst::FCMP_ULE:
1312 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1313 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1314 AMDGPU::V_CMP_NGT_F64_e64);
1315 case CmpInst::FCMP_UNE:
1316 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1317 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1318 AMDGPU::V_CMP_NEQ_F64_e64);
1319 case CmpInst::FCMP_TRUE:
1320 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1321 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1322 AMDGPU::V_CMP_TRU_F64_e64);
1323 case CmpInst::FCMP_FALSE:
1324 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1325 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1326 AMDGPU::V_CMP_F_F64_e64);
1327 }
1328}
1329
1330int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1331 unsigned Size) const {
1332 if (Size == 64) {
1333 if (!STI.hasScalarCompareEq64())
1334 return -1;
1335
1336 switch (P) {
1337 case CmpInst::ICMP_NE:
1338 return AMDGPU::S_CMP_LG_U64;
1339 case CmpInst::ICMP_EQ:
1340 return AMDGPU::S_CMP_EQ_U64;
1341 default:
1342 return -1;
1343 }
1344 }
1345
1346 if (Size == 32) {
1347 switch (P) {
1348 case CmpInst::ICMP_NE:
1349 return AMDGPU::S_CMP_LG_U32;
1350 case CmpInst::ICMP_EQ:
1351 return AMDGPU::S_CMP_EQ_U32;
1352 case CmpInst::ICMP_SGT:
1353 return AMDGPU::S_CMP_GT_I32;
1354 case CmpInst::ICMP_SGE:
1355 return AMDGPU::S_CMP_GE_I32;
1356 case CmpInst::ICMP_SLT:
1357 return AMDGPU::S_CMP_LT_I32;
1358 case CmpInst::ICMP_SLE:
1359 return AMDGPU::S_CMP_LE_I32;
1360 case CmpInst::ICMP_UGT:
1361 return AMDGPU::S_CMP_GT_U32;
1362 case CmpInst::ICMP_UGE:
1363 return AMDGPU::S_CMP_GE_U32;
1364 case CmpInst::ICMP_ULT:
1365 return AMDGPU::S_CMP_LT_U32;
1366 case CmpInst::ICMP_ULE:
1367 return AMDGPU::S_CMP_LE_U32;
1368 case CmpInst::FCMP_OEQ:
1369 return AMDGPU::S_CMP_EQ_F32;
1370 case CmpInst::FCMP_OGT:
1371 return AMDGPU::S_CMP_GT_F32;
1372 case CmpInst::FCMP_OGE:
1373 return AMDGPU::S_CMP_GE_F32;
1374 case CmpInst::FCMP_OLT:
1375 return AMDGPU::S_CMP_LT_F32;
1376 case CmpInst::FCMP_OLE:
1377 return AMDGPU::S_CMP_LE_F32;
1378 case CmpInst::FCMP_ONE:
1379 return AMDGPU::S_CMP_LG_F32;
1380 case CmpInst::FCMP_ORD:
1381 return AMDGPU::S_CMP_O_F32;
1382 case CmpInst::FCMP_UNO:
1383 return AMDGPU::S_CMP_U_F32;
1384 case CmpInst::FCMP_UEQ:
1385 return AMDGPU::S_CMP_NLG_F32;
1386 case CmpInst::FCMP_UGT:
1387 return AMDGPU::S_CMP_NLE_F32;
1388 case CmpInst::FCMP_UGE:
1389 return AMDGPU::S_CMP_NLT_F32;
1390 case CmpInst::FCMP_ULT:
1391 return AMDGPU::S_CMP_NGE_F32;
1392 case CmpInst::FCMP_ULE:
1393 return AMDGPU::S_CMP_NGT_F32;
1394 case CmpInst::FCMP_UNE:
1395 return AMDGPU::S_CMP_NEQ_F32;
1396 default:
1397 llvm_unreachable("Unknown condition code!");
1398 }
1399 }
1400
1401 if (Size == 16) {
1402 if (!STI.hasSALUFloatInsts())
1403 return -1;
1404
1405 switch (P) {
1406 case CmpInst::FCMP_OEQ:
1407 return AMDGPU::S_CMP_EQ_F16;
1408 case CmpInst::FCMP_OGT:
1409 return AMDGPU::S_CMP_GT_F16;
1410 case CmpInst::FCMP_OGE:
1411 return AMDGPU::S_CMP_GE_F16;
1412 case CmpInst::FCMP_OLT:
1413 return AMDGPU::S_CMP_LT_F16;
1414 case CmpInst::FCMP_OLE:
1415 return AMDGPU::S_CMP_LE_F16;
1416 case CmpInst::FCMP_ONE:
1417 return AMDGPU::S_CMP_LG_F16;
1418 case CmpInst::FCMP_ORD:
1419 return AMDGPU::S_CMP_O_F16;
1420 case CmpInst::FCMP_UNO:
1421 return AMDGPU::S_CMP_U_F16;
1422 case CmpInst::FCMP_UEQ:
1423 return AMDGPU::S_CMP_NLG_F16;
1424 case CmpInst::FCMP_UGT:
1425 return AMDGPU::S_CMP_NLE_F16;
1426 case CmpInst::FCMP_UGE:
1427 return AMDGPU::S_CMP_NLT_F16;
1428 case CmpInst::FCMP_ULT:
1429 return AMDGPU::S_CMP_NGE_F16;
1430 case CmpInst::FCMP_ULE:
1431 return AMDGPU::S_CMP_NGT_F16;
1432 case CmpInst::FCMP_UNE:
1433 return AMDGPU::S_CMP_NEQ_F16;
1434 default:
1435 llvm_unreachable("Unknown condition code!");
1436 }
1437 }
1438
1439 return -1;
1440}
1441
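// Select G_ICMP/G_FCMP: a non-lane-mask result uses the scalar S_CMP* opcodes
// plus a copy from SCC; a lane-mask integer compare uses the VALU V_CMP* form.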
1442bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1443
1444 MachineBasicBlock *BB = I.getParent();
1445 const DebugLoc &DL = I.getDebugLoc();
1446
1447 Register SrcReg = I.getOperand(2).getReg();
1448 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1449
1450 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1451
1452 Register CCReg = I.getOperand(0).getReg();
1453 if (!isVCC(CCReg, *MRI)) {
1454 int Opcode = getS_CMPOpcode(Pred, Size);
1455 if (Opcode == -1)
1456 return false;
1457 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1458 .add(I.getOperand(2))
1459 .add(I.getOperand(3));
1460 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1461 .addReg(AMDGPU::SCC);
1462 bool Ret =
1463 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1464 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1465 I.eraseFromParent();
1466 return Ret;
1467 }
1468
1469 if (I.getOpcode() == AMDGPU::G_FCMP)
1470 return false;
1471
1472 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1473 if (Opcode == -1)
1474 return false;
1475
1476 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1477 I.getOperand(0).getReg())
1478 .add(I.getOperand(2))
1479 .add(I.getOperand(3));
1480 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1481 *TRI.getBoolRC(), *MRI);
1482 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1483 I.eraseFromParent();
1484 return Ret;
1485}
1486
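// Select amdgcn.icmp/amdgcn.fcmp, which return a full wave-mask result in
// SGPRs rather than a divergent i1, applying VOP3 source modifiers where the
// selected V_CMP opcode accepts them.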
1487bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1488 Register Dst = I.getOperand(0).getReg();
1489 if (isVCC(Dst, *MRI))
1490 return false;
1491
1492 LLT DstTy = MRI->getType(Dst);
1493 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1494 return false;
1495
1496 MachineBasicBlock *BB = I.getParent();
1497 const DebugLoc &DL = I.getDebugLoc();
1498 Register SrcReg = I.getOperand(2).getReg();
1499 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1500
1501 // i1 inputs are not supported in GlobalISel.
1502 if (Size == 1)
1503 return false;
1504
1505 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1506 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1507 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1508 I.eraseFromParent();
1509 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1510 }
1511
1512 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1513 if (Opcode == -1)
1514 return false;
1515
1516 MachineInstrBuilder SelectedMI;
1517 MachineOperand &LHS = I.getOperand(2);
1518 MachineOperand &RHS = I.getOperand(3);
1519 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1520 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1521 Register Src0Reg =
1522 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1523 Register Src1Reg =
1524 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1525 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1526 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1527 SelectedMI.addImm(Src0Mods);
1528 SelectedMI.addReg(Src0Reg);
1529 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1530 SelectedMI.addImm(Src1Mods);
1531 SelectedMI.addReg(Src1Reg);
1532 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1533 SelectedMI.addImm(0); // clamp
1534 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1535 SelectedMI.addImm(0); // op_sel
1536
1537 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1538 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1539 return false;
1540
1541 I.eraseFromParent();
1542 return true;
1543}
1544
1545// Ballot has to zero the bits of the input lane mask that are zero in the
1546// current exec; this is done as an AND with exec. For inputs produced by an
1547// instruction that implicitly uses the same exec (for example a compare in the
1548// same basic block, or an SCC-to-VCC copy), a plain COPY is used instead.
1549static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1550 MachineBasicBlock *MBB) {
1551 MachineInstr *MI = MRI.getVRegDef(Reg);
1552 if (MI->getParent() != MBB)
1553 return false;
1554
1555 // Lane mask generated by SCC to VCC copy.
1556 if (MI->getOpcode() == AMDGPU::COPY) {
1557 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1558 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1559 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1560 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1561 return true;
1562 }
1563
1564 // Lane mask generated using compare with same exec.
1565 if (isa<GAnyCmp>(MI))
1566 return true;
1567
1568 Register LHS, RHS;
1569 // Look through AND.
1570 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1571 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1572 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1573
1574 return false;
1575}
1576
1577bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1578 MachineBasicBlock *BB = I.getParent();
1579 const DebugLoc &DL = I.getDebugLoc();
1580 Register DstReg = I.getOperand(0).getReg();
1581 Register SrcReg = I.getOperand(2).getReg();
1582 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1583 const unsigned WaveSize = STI.getWavefrontSize();
1584
1585 // In the common case, the return type matches the wave size.
1586 // However we also support emitting i64 ballots in wave32 mode.
1587 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1588 return false;
1589
1590 std::optional<ValueAndVReg> Arg =
1591 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1592
1593 Register Dst = DstReg;
1594 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1595 if (BallotSize != WaveSize) {
1596 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1597 }
1598
1599 if (Arg) {
1600 const int64_t Value = Arg->Value.getZExtValue();
1601 if (Value == 0) {
1602 // Dst = S_MOV 0
1603 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1604 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1605 } else {
1606 // Dst = COPY EXEC
1607 assert(Value == 1);
1608 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1609 }
1610 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1611 return false;
1612 } else {
1613 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1614 // Dst = COPY SrcReg
1615 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1616 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1617 return false;
1618 } else {
1619 // Dst = S_AND SrcReg, EXEC
1620 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1621 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1622 .addReg(SrcReg)
1623 .addReg(TRI.getExec())
1624 .setOperandDead(3); // Dead scc
1625 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1626 return false;
1627 }
1628 }
1629
1630 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1631 if (BallotSize != WaveSize) {
1632 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1633 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1634 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1635 .addReg(Dst)
1636 .addImm(AMDGPU::sub0)
1637 .addReg(HiReg)
1638 .addImm(AMDGPU::sub1);
1639 }
1640
1641 I.eraseFromParent();
1642 return true;
1643}
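// A rough illustration of the cases above (wave64): a ballot of a constant
// true becomes "Dst = COPY $exec", a constant false becomes "Dst = S_MOV_B64 0",
// a lane mask produced in the same block is just copied, and anything else is
// selected as "Dst = S_AND_B64 Src, $exec". The i64-on-wave32 form additionally
// widens the 32-bit result with S_MOV_B32 0 + REG_SEQUENCE.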
1644
1645bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1646 Register DstReg = I.getOperand(0).getReg();
1647 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1648 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1649 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1650 return false;
1651
1652 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1653
1654 Module *M = MF->getFunction().getParent();
1655 const MDNode *Metadata = I.getOperand(2).getMetadata();
1656 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1657 auto *RelocSymbol = cast<GlobalVariable>(
1658 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1659
1660 MachineBasicBlock *BB = I.getParent();
1661 BuildMI(*BB, &I, I.getDebugLoc(),
1662 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1663 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1664
1665 I.eraseFromParent();
1666 return true;
1667}
1668
1669bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1670 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1671
1672 Register DstReg = I.getOperand(0).getReg();
1673 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1674 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1675 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1676
1677 MachineBasicBlock *MBB = I.getParent();
1678 const DebugLoc &DL = I.getDebugLoc();
1679
1680 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1681
1682 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1683 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1684 MIB.addImm(MFI->getLDSSize());
1685 } else {
1686 Module *M = MF->getFunction().getParent();
1687 const GlobalValue *GV =
1688 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1689 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1690 }
1691
1692 I.eraseFromParent();
1693 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1694}
1695
1696bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1697 MachineBasicBlock *MBB = I.getParent();
1698 MachineFunction &MF = *MBB->getParent();
1699 const DebugLoc &DL = I.getDebugLoc();
1700
1701 MachineOperand &Dst = I.getOperand(0);
1702 Register DstReg = Dst.getReg();
1703 unsigned Depth = I.getOperand(2).getImm();
1704
1705 const TargetRegisterClass *RC
1706 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1707 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1708 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1709 return false;
1710
1711 // Check for kernel and shader functions
1712 if (Depth != 0 ||
1713 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1714 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1715 .addImm(0);
1716 I.eraseFromParent();
1717 return true;
1718 }
1719
1720 MachineFrameInfo &MFI = MF.getFrameInfo();
1721 // There is a call to @llvm.returnaddress in this function
1722 MFI.setReturnAddressIsTaken(true);
1723
1724 // Get the return address reg and mark it as an implicit live-in
1725 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1726 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1727 AMDGPU::SReg_64RegClass, DL);
1728 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1729 .addReg(LiveIn);
1730 I.eraseFromParent();
1731 return true;
1732}
1733
1734bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1735 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1736 // SelectionDAG uses for wave32 vs wave64.
1737 MachineBasicBlock *BB = MI.getParent();
1738 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1739 .add(MI.getOperand(1));
1740
1741 Register Reg = MI.getOperand(1).getReg();
1742 MI.eraseFromParent();
1743
1744 if (!MRI->getRegClassOrNull(Reg))
1745 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1746 return true;
1747}
1748
1749bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1750 MachineInstr &MI, Intrinsic::ID IntrID) const {
1751 MachineBasicBlock *MBB = MI.getParent();
1752 MachineFunction *MF = MBB->getParent();
1753 const DebugLoc &DL = MI.getDebugLoc();
1754
1755 unsigned IndexOperand = MI.getOperand(7).getImm();
1756 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1757 bool WaveDone = MI.getOperand(9).getImm() != 0;
1758
1759 if (WaveDone && !WaveRelease)
1760 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1761
1762 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1763 IndexOperand &= ~0x3f;
1764 unsigned CountDw = 0;
1765
1766 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1767 CountDw = (IndexOperand >> 24) & 0xf;
1768 IndexOperand &= ~(0xf << 24);
1769
1770 if (CountDw < 1 || CountDw > 4) {
1771 report_fatal_error(
1772 "ds_ordered_count: dword count must be between 1 and 4");
1773 }
1774 }
1775
1776 if (IndexOperand)
1777 report_fatal_error("ds_ordered_count: bad index operand");
1778
1779 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1780 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1781
1782 unsigned Offset0 = OrderedCountIndex << 2;
1783 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1784
1785 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1786 Offset1 |= (CountDw - 1) << 6;
1787
1788 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1789 Offset1 |= ShaderType << 2;
1790
1791 unsigned Offset = Offset0 | (Offset1 << 8);
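// For reference, the packed DS_ORDERED_COUNT immediate built above is laid
// out roughly as:
//   Offset[7:0]   = OrderedCountIndex << 2
//   Offset[8]     = wave_release
//   Offset[9]     = wave_done
//   Offset[11:10] = shader type (pre-GFX11 only)
//   Offset[12]    = 0 for ds_ordered_add, 1 for ds_ordered_swap
//   Offset[15:14] = dword count - 1 (GFX10+ only)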
1792
1793 Register M0Val = MI.getOperand(2).getReg();
1794 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1795 .addReg(M0Val);
1796
1797 Register DstReg = MI.getOperand(0).getReg();
1798 Register ValReg = MI.getOperand(3).getReg();
1799 MachineInstrBuilder DS =
1800 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1801 .addReg(ValReg)
1802 .addImm(Offset)
1803 .cloneMemRefs(MI);
1804
1805 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1806 return false;
1807
1808 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1809 MI.eraseFromParent();
1810 return Ret;
1811}
1812
1813static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1814 switch (IntrID) {
1815 case Intrinsic::amdgcn_ds_gws_init:
1816 return AMDGPU::DS_GWS_INIT;
1817 case Intrinsic::amdgcn_ds_gws_barrier:
1818 return AMDGPU::DS_GWS_BARRIER;
1819 case Intrinsic::amdgcn_ds_gws_sema_v:
1820 return AMDGPU::DS_GWS_SEMA_V;
1821 case Intrinsic::amdgcn_ds_gws_sema_br:
1822 return AMDGPU::DS_GWS_SEMA_BR;
1823 case Intrinsic::amdgcn_ds_gws_sema_p:
1824 return AMDGPU::DS_GWS_SEMA_P;
1825 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1826 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1827 default:
1828 llvm_unreachable("not a gws intrinsic");
1829 }
1830}
1831
1832bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1833 Intrinsic::ID IID) const {
1834 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1835 !STI.hasGWSSemaReleaseAll()))
1836 return false;
1837
1838 // intrinsic ID, vsrc, offset
1839 const bool HasVSrc = MI.getNumOperands() == 3;
1840 assert(HasVSrc || MI.getNumOperands() == 2);
1841
1842 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1843 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1844 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1845 return false;
1846
1847 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1848 unsigned ImmOffset;
1849
1850 MachineBasicBlock *MBB = MI.getParent();
1851 const DebugLoc &DL = MI.getDebugLoc();
1852
1853 MachineInstr *Readfirstlane = nullptr;
1854
1855 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1856 // incoming offset, in case there's an add of a constant. We'll have to put it
1857 // back later.
1858 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1859 Readfirstlane = OffsetDef;
1860 BaseOffset = OffsetDef->getOperand(1).getReg();
1861 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1862 }
1863
1864 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1865 // If we have a constant offset, try to use the 0 in m0 as the base.
1866 // TODO: Look into changing the default m0 initialization value. If the
1867 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1868 // the immediate offset.
1869
1870 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1871 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1872 .addImm(0);
1873 } else {
1874 std::tie(BaseOffset, ImmOffset) =
1875 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1876
1877 if (Readfirstlane) {
1878 // We have the constant offset now, so put the readfirstlane back on the
1879 // variable component.
1880 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1881 return false;
1882
1883 Readfirstlane->getOperand(1).setReg(BaseOffset);
1884 BaseOffset = Readfirstlane->getOperand(0).getReg();
1885 } else {
1886 if (!RBI.constrainGenericRegister(BaseOffset,
1887 AMDGPU::SReg_32RegClass, *MRI))
1888 return false;
1889 }
1890
1891 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1893 .addReg(BaseOffset)
1894 .addImm(16)
1895 .setOperandDead(3); // Dead scc
1896
1897 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1898 .addReg(M0Base);
1899 }
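// In short: the constant part of the GWS offset ends up in the instruction's
// immediate, while the variable part is placed in M0[21:16] (hence the
// S_LSHL_B32 by 16 above); the constant-only case instead writes 0 to M0 and
// keeps the whole value in the immediate.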
1900
1901 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1902 // offset field) % 64. Some versions of the programming guide omit the m0
1903 // part, or claim it's from offset 0.
1904 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1905
1906 if (HasVSrc) {
1907 Register VSrc = MI.getOperand(1).getReg();
1908 MIB.addReg(VSrc);
1909
1910 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1911 return false;
1912 }
1913
1914 MIB.addImm(ImmOffset)
1915 .cloneMemRefs(MI);
1916
1917 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1918
1919 MI.eraseFromParent();
1920 return true;
1921}
1922
1923bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1924 bool IsAppend) const {
1925 Register PtrBase = MI.getOperand(2).getReg();
1926 LLT PtrTy = MRI->getType(PtrBase);
1927 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1928
1929 unsigned Offset;
1930 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1931
1932 // TODO: Should this try to look through readfirstlane like GWS?
1933 if (!isDSOffsetLegal(PtrBase, Offset)) {
1934 PtrBase = MI.getOperand(2).getReg();
1935 Offset = 0;
1936 }
1937
1938 MachineBasicBlock *MBB = MI.getParent();
1939 const DebugLoc &DL = MI.getDebugLoc();
1940 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1941
1942 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1943 .addReg(PtrBase);
1944 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1945 return false;
1946
1947 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1948 .addImm(Offset)
1949 .addImm(IsGDS ? -1 : 0)
1950 .cloneMemRefs(MI);
1951 MI.eraseFromParent();
1952 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1953}
1954
1955bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1956 MachineFunction *MF = MI.getParent()->getParent();
1957 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1958
1959 MFInfo->setInitWholeWave();
1960 return selectImpl(MI, *CoverageInfo);
1961}
1962
1963bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1964 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1965 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1966 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1967 if (WGSize <= STI.getWavefrontSize()) {
1968 // If the workgroup fits in a wave, remove s_barrier_signal and lower
1969 // s_barrier/s_barrier_wait to wave_barrier.
1970 if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1971 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1972 MachineBasicBlock *MBB = MI.getParent();
1973 const DebugLoc &DL = MI.getDebugLoc();
1974 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1975 }
1976 MI.eraseFromParent();
1977 return true;
1978 }
1979 }
1980
1981 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
1982 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1983 MachineBasicBlock *MBB = MI.getParent();
1984 const DebugLoc &DL = MI.getDebugLoc();
1985 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1986 .addImm(AMDGPU::Barrier::WORKGROUP);
1987 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1988 .addImm(AMDGPU::Barrier::WORKGROUP);
1989 MI.eraseFromParent();
1990 return true;
1991 }
1992
1993 return selectImpl(MI, *CoverageInfo);
1994}
1995
1996static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1997 bool &IsTexFail) {
1998 if (TexFailCtrl)
1999 IsTexFail = true;
2000
2001 TFE = (TexFailCtrl & 0x1) ? true : false;
2002 TexFailCtrl &= ~(uint64_t)0x1;
2003 LWE = (TexFailCtrl & 0x2) ? true : false;
2004 TexFailCtrl &= ~(uint64_t)0x2;
2005
2006 return TexFailCtrl == 0;
2007}
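// E.g. a texfailctrl value of 3 enables both TFE (bit 0) and LWE (bit 1);
// any additional set bit leaves TexFailCtrl non-zero here and the caller
// rejects the intrinsic.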
2008
2009bool AMDGPUInstructionSelector::selectImageIntrinsic(
2010 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2011 MachineBasicBlock *MBB = MI.getParent();
2012 const DebugLoc &DL = MI.getDebugLoc();
2013
2014 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2015 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
2016
2017 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2018 unsigned IntrOpcode = Intr->BaseOpcode;
2019 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2020 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2021 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2022
2023 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2024
2025 Register VDataIn, VDataOut;
2026 LLT VDataTy;
2027 int NumVDataDwords = -1;
2028 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2029 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2030
2031 bool Unorm;
2032 if (!BaseOpcode->Sampler)
2033 Unorm = true;
2034 else
2035 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2036
2037 bool TFE;
2038 bool LWE;
2039 bool IsTexFail = false;
2040 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2041 TFE, LWE, IsTexFail))
2042 return false;
2043
2044 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2045 const bool IsA16 = (Flags & 1) != 0;
2046 const bool IsG16 = (Flags & 2) != 0;
2047
2048 // A16 implies 16 bit gradients if subtarget doesn't support G16
2049 if (IsA16 && !STI.hasG16() && !IsG16)
2050 return false;
2051
2052 unsigned DMask = 0;
2053 unsigned DMaskLanes = 0;
2054
2055 if (BaseOpcode->Atomic) {
2056 VDataOut = MI.getOperand(0).getReg();
2057 VDataIn = MI.getOperand(2).getReg();
2058 LLT Ty = MRI->getType(VDataIn);
2059
2060 // Be careful to allow atomic swap on 16-bit element vectors.
2061 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2062 Ty.getSizeInBits() == 128 :
2063 Ty.getSizeInBits() == 64;
2064
2065 if (BaseOpcode->AtomicX2) {
2066 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2067
2068 DMask = Is64Bit ? 0xf : 0x3;
2069 NumVDataDwords = Is64Bit ? 4 : 2;
2070 } else {
2071 DMask = Is64Bit ? 0x3 : 0x1;
2072 NumVDataDwords = Is64Bit ? 2 : 1;
2073 }
2074 } else {
2075 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2076 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2077
2078 if (BaseOpcode->Store) {
2079 VDataIn = MI.getOperand(1).getReg();
2080 VDataTy = MRI->getType(VDataIn);
2081 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2082 } else if (BaseOpcode->NoReturn) {
2083 NumVDataDwords = 0;
2084 } else {
2085 VDataOut = MI.getOperand(0).getReg();
2086 VDataTy = MRI->getType(VDataOut);
2087 NumVDataDwords = DMaskLanes;
2088
2089 if (IsD16 && !STI.hasUnpackedD16VMem())
2090 NumVDataDwords = (DMaskLanes + 1) / 2;
2091 }
2092 }
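// E.g. a D16 load with dmask = 0xf has 4 enabled lanes; on targets with
// packed D16 memory instructions those 16-bit values occupy
// (4 + 1) / 2 = 2 dwords, which is what NumVDataDwords ends up as here.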
2093
2094 // Set G16 opcode
2095 if (Subtarget->hasG16() && IsG16) {
2096 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2097 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2098 assert(G16MappingInfo);
2099 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2100 }
2101
2102 // TODO: Check this in verifier.
2103 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2104
2105 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2106 if (BaseOpcode->Atomic)
2107 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2108 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2109 AMDGPU::CPol::SWZ_pregfx12))
2110 return false;
2111
2112 int NumVAddrRegs = 0;
2113 int NumVAddrDwords = 0;
2114 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2115 // Skip the $noregs and 0s inserted during legalization.
2116 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2117 if (!AddrOp.isReg())
2118 continue; // XXX - Break?
2119
2120 Register Addr = AddrOp.getReg();
2121 if (!Addr)
2122 break;
2123
2124 ++NumVAddrRegs;
2125 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2126 }
2127
2128 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2129 // NSA, these should have been packed into a single value in the first
2130 // address register
2131 const bool UseNSA =
2132 NumVAddrRegs != 1 &&
2133 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2134 : NumVAddrDwords == NumVAddrRegs);
2135 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2136 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2137 return false;
2138 }
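// Rough example: three separate one-dword address VGPRs (3 regs, 3 dwords)
// pick the NSA encoding, while addresses that were packed into a single wide
// register tuple (NumVAddrRegs == 1) use the non-NSA form; with partial NSA
// support, a trailing register wider than one dword is also acceptable.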
2139
2140 if (IsTexFail)
2141 ++NumVDataDwords;
2142
2143 int Opcode = -1;
2144 if (IsGFX12Plus) {
2145 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2146 NumVDataDwords, NumVAddrDwords);
2147 } else if (IsGFX11Plus) {
2148 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2149 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2150 : AMDGPU::MIMGEncGfx11Default,
2151 NumVDataDwords, NumVAddrDwords);
2152 } else if (IsGFX10Plus) {
2153 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2154 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2155 : AMDGPU::MIMGEncGfx10Default,
2156 NumVDataDwords, NumVAddrDwords);
2157 } else {
2158 if (Subtarget->hasGFX90AInsts()) {
2159 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2160 NumVDataDwords, NumVAddrDwords);
2161 if (Opcode == -1) {
2162 LLVM_DEBUG(
2163 dbgs()
2164 << "requested image instruction is not supported on this GPU\n");
2165 return false;
2166 }
2167 }
2168 if (Opcode == -1 &&
2169 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2170 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2171 NumVDataDwords, NumVAddrDwords);
2172 if (Opcode == -1)
2173 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2174 NumVDataDwords, NumVAddrDwords);
2175 }
2176 if (Opcode == -1)
2177 return false;
2178
2179 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2180 .cloneMemRefs(MI);
2181
2182 if (VDataOut) {
2183 if (BaseOpcode->AtomicX2) {
2184 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2185
2186 Register TmpReg = MRI->createVirtualRegister(
2187 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2188 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2189
2190 MIB.addDef(TmpReg);
2191 if (!MRI->use_empty(VDataOut)) {
2192 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2193 .addReg(TmpReg, RegState::Kill, SubReg);
2194 }
2195
2196 } else {
2197 MIB.addDef(VDataOut); // vdata output
2198 }
2199 }
2200
2201 if (VDataIn)
2202 MIB.addReg(VDataIn); // vdata input
2203
2204 for (int I = 0; I != NumVAddrRegs; ++I) {
2205 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2206 if (SrcOp.isReg()) {
2207 assert(SrcOp.getReg() != 0);
2208 MIB.addReg(SrcOp.getReg());
2209 }
2210 }
2211
2212 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2213 if (BaseOpcode->Sampler)
2214 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2215
2216 MIB.addImm(DMask); // dmask
2217
2218 if (IsGFX10Plus)
2219 MIB.addImm(DimInfo->Encoding);
2220 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2221 MIB.addImm(Unorm);
2222
2223 MIB.addImm(CPol);
2224 MIB.addImm(IsA16 && // a16 or r128
2225 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2226 if (IsGFX10Plus)
2227 MIB.addImm(IsA16 ? -1 : 0);
2228
2229 if (!Subtarget->hasGFX90AInsts()) {
2230 MIB.addImm(TFE); // tfe
2231 } else if (TFE) {
2232 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2233 return false;
2234 }
2235
2236 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2237 MIB.addImm(LWE); // lwe
2238 if (!IsGFX10Plus)
2239 MIB.addImm(DimInfo->DA ? -1 : 0);
2240 if (BaseOpcode->HasD16)
2241 MIB.addImm(IsD16 ? -1 : 0);
2242
2243 MI.eraseFromParent();
2244 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2245 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2246 return true;
2247}
2248
2249// We need to handle this here because tablegen doesn't support matching
2250// instructions with multiple outputs.
2251bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2252 MachineInstr &MI) const {
2253 Register Dst0 = MI.getOperand(0).getReg();
2254 Register Dst1 = MI.getOperand(1).getReg();
2255
2256 const DebugLoc &DL = MI.getDebugLoc();
2257 MachineBasicBlock *MBB = MI.getParent();
2258
2259 Register Addr = MI.getOperand(3).getReg();
2260 Register Data0 = MI.getOperand(4).getReg();
2261 Register Data1 = MI.getOperand(5).getReg();
2262 unsigned Offset = MI.getOperand(6).getImm();
2263
2264 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2265 .addDef(Dst1)
2266 .addUse(Addr)
2267 .addUse(Data0)
2268 .addUse(Data1)
2269 .addImm(Offset)
2270 .cloneMemRefs(MI);
2271
2272 MI.eraseFromParent();
2273 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2274}
2275
2276bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2277 MachineInstr &I) const {
2278 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2279 switch (IntrinsicID) {
2280 case Intrinsic::amdgcn_end_cf:
2281 return selectEndCfIntrinsic(I);
2282 case Intrinsic::amdgcn_ds_ordered_add:
2283 case Intrinsic::amdgcn_ds_ordered_swap:
2284 return selectDSOrderedIntrinsic(I, IntrinsicID);
2285 case Intrinsic::amdgcn_ds_gws_init:
2286 case Intrinsic::amdgcn_ds_gws_barrier:
2287 case Intrinsic::amdgcn_ds_gws_sema_v:
2288 case Intrinsic::amdgcn_ds_gws_sema_br:
2289 case Intrinsic::amdgcn_ds_gws_sema_p:
2290 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2291 return selectDSGWSIntrinsic(I, IntrinsicID);
2292 case Intrinsic::amdgcn_ds_append:
2293 return selectDSAppendConsume(I, true);
2294 case Intrinsic::amdgcn_ds_consume:
2295 return selectDSAppendConsume(I, false);
2296 case Intrinsic::amdgcn_init_whole_wave:
2297 return selectInitWholeWave(I);
2298 case Intrinsic::amdgcn_s_barrier:
2299 case Intrinsic::amdgcn_s_barrier_signal:
2300 case Intrinsic::amdgcn_s_barrier_wait:
2301 return selectSBarrier(I);
2302 case Intrinsic::amdgcn_raw_buffer_load_lds:
2303 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2304 case Intrinsic::amdgcn_struct_buffer_load_lds:
2305 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2306 return selectBufferLoadLds(I);
2307 case Intrinsic::amdgcn_global_load_lds:
2308 return selectGlobalLoadLds(I);
2309 case Intrinsic::amdgcn_exp_compr:
2310 if (!STI.hasCompressedExport()) {
2311 Function &F = I.getMF()->getFunction();
2312 DiagnosticInfoUnsupported NoFpRet(
2313 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2314 F.getContext().diagnose(NoFpRet);
2315 return false;
2316 }
2317 break;
2318 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2319 return selectDSBvhStackIntrinsic(I);
2320 case Intrinsic::amdgcn_s_barrier_init:
2321 case Intrinsic::amdgcn_s_barrier_signal_var:
2322 return selectNamedBarrierInit(I, IntrinsicID);
2323 case Intrinsic::amdgcn_s_barrier_join:
2324 case Intrinsic::amdgcn_s_get_named_barrier_state:
2325 return selectNamedBarrierInst(I, IntrinsicID);
2326 case Intrinsic::amdgcn_s_get_barrier_state:
2327 return selectSGetBarrierState(I, IntrinsicID);
2328 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2329 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2330 }
2331 return selectImpl(I, *CoverageInfo);
2332}
2333
2334bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2335 if (selectImpl(I, *CoverageInfo))
2336 return true;
2337
2338 MachineBasicBlock *BB = I.getParent();
2339 const DebugLoc &DL = I.getDebugLoc();
2340
2341 Register DstReg = I.getOperand(0).getReg();
2342 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2343 assert(Size <= 32 || Size == 64);
2344 const MachineOperand &CCOp = I.getOperand(1);
2345 Register CCReg = CCOp.getReg();
2346 if (!isVCC(CCReg, *MRI)) {
2347 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2348 AMDGPU::S_CSELECT_B32;
2349 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2350 .addReg(CCReg);
2351
2352 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2353 // bank, because it does not cover the register class that we use to represent
2354 // it. So we need to manually set the register class here.
2355 if (!MRI->getRegClassOrNull(CCReg))
2356 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2357 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2358 .add(I.getOperand(2))
2359 .add(I.getOperand(3));
2360
2361 bool Ret = false;
2362 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2363 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2364 I.eraseFromParent();
2365 return Ret;
2366 }
2367
2368 // Wide VGPR select should have been split in RegBankSelect.
2369 if (Size > 32)
2370 return false;
2371
2372 MachineInstr *Select =
2373 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2374 .addImm(0)
2375 .add(I.getOperand(3))
2376 .addImm(0)
2377 .add(I.getOperand(2))
2378 .add(I.getOperand(1));
2379
2380 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2381 I.eraseFromParent();
2382 return Ret;
2383}
2384
2385bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2386 Register DstReg = I.getOperand(0).getReg();
2387 Register SrcReg = I.getOperand(1).getReg();
2388 const LLT DstTy = MRI->getType(DstReg);
2389 const LLT SrcTy = MRI->getType(SrcReg);
2390 const LLT S1 = LLT::scalar(1);
2391
2392 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2393 const RegisterBank *DstRB;
2394 if (DstTy == S1) {
2395 // This is a special case. We don't treat s1 for legalization artifacts as
2396 // vcc booleans.
2397 DstRB = SrcRB;
2398 } else {
2399 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2400 if (SrcRB != DstRB)
2401 return false;
2402 }
2403
2404 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2405
2406 unsigned DstSize = DstTy.getSizeInBits();
2407 unsigned SrcSize = SrcTy.getSizeInBits();
2408
2409 const TargetRegisterClass *SrcRC =
2410 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2411 const TargetRegisterClass *DstRC =
2412 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2413 if (!SrcRC || !DstRC)
2414 return false;
2415
2416 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2417 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2418 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2419 return false;
2420 }
2421
2422 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2423 assert(STI.useRealTrue16Insts());
2424 const DebugLoc &DL = I.getDebugLoc();
2425 MachineBasicBlock *MBB = I.getParent();
2426 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2427 .addReg(SrcReg, 0, AMDGPU::lo16);
2428 I.eraseFromParent();
2429 return true;
2430 }
2431
2432 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2433 MachineBasicBlock *MBB = I.getParent();
2434 const DebugLoc &DL = I.getDebugLoc();
2435
2436 Register LoReg = MRI->createVirtualRegister(DstRC);
2437 Register HiReg = MRI->createVirtualRegister(DstRC);
2438 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2439 .addReg(SrcReg, 0, AMDGPU::sub0);
2440 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2441 .addReg(SrcReg, 0, AMDGPU::sub1);
2442
2443 if (IsVALU && STI.hasSDWA()) {
2444 // Write the low 16-bits of the high element into the high 16-bits of the
2445 // low element.
2446 MachineInstr *MovSDWA =
2447 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2448 .addImm(0) // $src0_modifiers
2449 .addReg(HiReg) // $src0
2450 .addImm(0) // $clamp
2451 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2452 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2453 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2454 .addReg(LoReg, RegState::Implicit);
2455 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2456 } else {
2457 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2458 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2459 Register ImmReg = MRI->createVirtualRegister(DstRC);
2460 if (IsVALU) {
2461 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2462 .addImm(16)
2463 .addReg(HiReg);
2464 } else {
2465 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2466 .addReg(HiReg)
2467 .addImm(16)
2468 .setOperandDead(3); // Dead scc
2469 }
2470
2471 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2472 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2473 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2474
2475 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2476 .addImm(0xffff);
2477 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2478 .addReg(LoReg)
2479 .addReg(ImmReg);
2480 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2481 .addReg(TmpReg0)
2482 .addReg(TmpReg1);
2483
2484 if (!IsVALU) {
2485 And.setOperandDead(3); // Dead scc
2486 Or.setOperandDead(3); // Dead scc
2487 }
2488 }
2489
2490 I.eraseFromParent();
2491 return true;
2492 }
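// Net effect of the v2s32 -> v2s16 path: Dst[15:0] = Lo[15:0] and
// Dst[31:16] = Hi[15:0], done either with a single SDWA mov that writes
// WORD_1 and preserves the rest, or with the shift/and/or sequence above.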
2493
2494 if (!DstTy.isScalar())
2495 return false;
2496
2497 if (SrcSize > 32) {
2498 unsigned SubRegIdx =
2499 DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2500 if (SubRegIdx == AMDGPU::NoSubRegister)
2501 return false;
2502
2503 // Deal with weird cases where the class only partially supports the subreg
2504 // index.
2505 const TargetRegisterClass *SrcWithSubRC
2506 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2507 if (!SrcWithSubRC)
2508 return false;
2509
2510 if (SrcWithSubRC != SrcRC) {
2511 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2512 return false;
2513 }
2514
2515 I.getOperand(1).setSubReg(SubRegIdx);
2516 }
2517
2518 I.setDesc(TII.get(TargetOpcode::COPY));
2519 return true;
2520}
2521
2522/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2523static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2524 Mask = maskTrailingOnes<unsigned>(Size);
2525 int SignedMask = static_cast<int>(Mask);
2526 return SignedMask >= -16 && SignedMask <= 64;
2527}
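// For example, zero-extending from 4 bits uses Mask = 0xf (15), which fits
// the [-16, 64] inline-constant range, so a plain AND is cheaper than a BFE
// with a 32-bit literal; an 8-bit mask (0xff = 255) does not fit and falls
// back to BFE.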
2528
2529// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2530const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2531 Register Reg, const MachineRegisterInfo &MRI,
2532 const TargetRegisterInfo &TRI) const {
2533 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2534 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2535 return RB;
2536
2537 // Ignore the type, since we don't use vcc in artifacts.
2538 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2539 return &RBI.getRegBankFromRegClass(*RC, LLT());
2540 return nullptr;
2541}
2542
2543bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2544 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2545 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2546 const DebugLoc &DL = I.getDebugLoc();
2547 MachineBasicBlock &MBB = *I.getParent();
2548 const Register DstReg = I.getOperand(0).getReg();
2549 const Register SrcReg = I.getOperand(1).getReg();
2550
2551 const LLT DstTy = MRI->getType(DstReg);
2552 const LLT SrcTy = MRI->getType(SrcReg);
2553 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2554 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2555 const unsigned DstSize = DstTy.getSizeInBits();
2556 if (!DstTy.isScalar())
2557 return false;
2558
2559 // Artifact casts should never use vcc.
2560 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2561
2562 // FIXME: This should probably be illegal and split earlier.
2563 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2564 if (DstSize <= 32)
2565 return selectCOPY(I);
2566
2567 const TargetRegisterClass *SrcRC =
2568 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2569 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2570 const TargetRegisterClass *DstRC =
2571 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2572
2573 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2574 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2575 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2576 .addReg(SrcReg)
2577 .addImm(AMDGPU::sub0)
2578 .addReg(UndefReg)
2579 .addImm(AMDGPU::sub1);
2580 I.eraseFromParent();
2581
2582 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2583 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2584 }
2585
2586 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2587 // 64-bit should have been split up in RegBankSelect
2588
2589 // Try to use an and with a mask if it will save code size.
2590 unsigned Mask;
2591 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2592 MachineInstr *ExtI =
2593 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2594 .addImm(Mask)
2595 .addReg(SrcReg);
2596 I.eraseFromParent();
2597 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2598 }
2599
2600 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2601 MachineInstr *ExtI =
2602 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2603 .addReg(SrcReg)
2604 .addImm(0) // Offset
2605 .addImm(SrcSize); // Width
2606 I.eraseFromParent();
2607 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2608 }
2609
2610 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2611 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2612 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2613 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2614 return false;
2615
2616 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2617 const unsigned SextOpc = SrcSize == 8 ?
2618 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2619 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2620 .addReg(SrcReg);
2621 I.eraseFromParent();
2622 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2623 }
2624
2625 // Using a single 32-bit SALU to calculate the high half is smaller than
2626 // S_BFE with a literal constant operand.
2627 if (DstSize > 32 && SrcSize == 32) {
2628 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2629 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2630 if (Signed) {
2631 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2632 .addReg(SrcReg, 0, SubReg)
2633 .addImm(31)
2634 .setOperandDead(3); // Dead scc
2635 } else {
2636 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2637 .addImm(0);
2638 }
2639 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2640 .addReg(SrcReg, 0, SubReg)
2641 .addImm(AMDGPU::sub0)
2642 .addReg(HiReg)
2643 .addImm(AMDGPU::sub1);
2644 I.eraseFromParent();
2645 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2646 *MRI);
2647 }
2648
2649 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2650 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2651
2652 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
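// E.g. a sign-extend from 8 bits passes SrcSize << 16 == 0x80000, i.e.
// offset 0 and width 8.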
2653 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2654 // We need a 64-bit register source, but the high bits don't matter.
2655 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2656 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2657 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2658
2659 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2660 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2661 .addReg(SrcReg, 0, SubReg)
2662 .addImm(AMDGPU::sub0)
2663 .addReg(UndefReg)
2664 .addImm(AMDGPU::sub1);
2665
2666 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2667 .addReg(ExtReg)
2668 .addImm(SrcSize << 16);
2669
2670 I.eraseFromParent();
2671 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2672 }
2673
2674 unsigned Mask;
2675 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2676 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2677 .addReg(SrcReg)
2678 .addImm(Mask)
2679 .setOperandDead(3); // Dead scc
2680 } else {
2681 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2682 .addReg(SrcReg)
2683 .addImm(SrcSize << 16);
2684 }
2685
2686 I.eraseFromParent();
2687 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2688 }
2689
2690 return false;
2691}
2692
2693static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2694 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2695}
2696
2697static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2698 Register BitcastSrc;
2699 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2700 Reg = BitcastSrc;
2701 return Reg;
2702}
2703
2704static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2705 Register &Out) {
2706 Register Trunc;
2707 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2708 return false;
2709
2710 Register LShlSrc;
2711 Register Cst;
2712 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2713 Cst = stripCopy(Cst, MRI);
2714 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2715 Out = stripBitCast(LShlSrc, MRI);
2716 return true;
2717 }
2718 }
2719
2720 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2721 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2722 return false;
2723
2724 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2725 LLT::fixed_vector(2, 16));
2726
2727 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2728 assert(Mask.size() == 2);
2729
2730 if (Mask[0] == 1 && Mask[1] <= 1) {
2731 Out = Shuffle->getOperand(0).getReg();
2732 return true;
2733 }
2734
2735 return false;
2736}
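// I.e. this recognizes both "trunc (lshr x, 16)" and a <2 x s16> shuffle
// selecting element 1, and hands back the wider source so callers can use a
// "high half" instruction directly.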
2737
2738bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2739 if (!Subtarget->hasSALUFloatInsts())
2740 return false;
2741
2742 Register Dst = I.getOperand(0).getReg();
2743 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2744 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2745 return false;
2746
2747 Register Src = I.getOperand(1).getReg();
2748
2749 if (MRI->getType(Dst) == LLT::scalar(32) &&
2750 MRI->getType(Src) == LLT::scalar(16)) {
2751 if (isExtractHiElt(*MRI, Src, Src)) {
2752 MachineBasicBlock *BB = I.getParent();
2753 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2754 .addUse(Src);
2755 I.eraseFromParent();
2756 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2757 }
2758 }
2759
2760 return false;
2761}
2762
2763bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2764 // Only manually handle the f64 SGPR case.
2765 //
2766 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2767 // the bit ops theoretically have a second result due to the implicit def of
2768 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2769 // that is easy by disabling the check. The result works, but uses a
2770 // nonsensical sreg32orlds_and_sreg_1 regclass.
2771 //
2772 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2773 // the variadic REG_SEQUENCE operands.
2774
2775 Register Dst = MI.getOperand(0).getReg();
2776 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2777 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2778 MRI->getType(Dst) != LLT::scalar(64))
2779 return false;
2780
2781 Register Src = MI.getOperand(1).getReg();
2782 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2783 if (Fabs)
2784 Src = Fabs->getOperand(1).getReg();
2785
2786 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2787 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2788 return false;
2789
2790 MachineBasicBlock *BB = MI.getParent();
2791 const DebugLoc &DL = MI.getDebugLoc();
2792 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2793 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2794 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2795 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2796
2797 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2798 .addReg(Src, 0, AMDGPU::sub0);
2799 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2800 .addReg(Src, 0, AMDGPU::sub1);
2801 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2802 .addImm(0x80000000);
2803
2804 // Set or toggle sign bit.
2805 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2806 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2807 .addReg(HiReg)
2808 .addReg(ConstReg)
2809 .setOperandDead(3); // Dead scc
2810 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2811 .addReg(LoReg)
2812 .addImm(AMDGPU::sub0)
2813 .addReg(OpReg)
2814 .addImm(AMDGPU::sub1);
2815 MI.eraseFromParent();
2816 return true;
2817}
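// Only the sign bit of the high 32 bits needs to change for an f64
// fneg/fneg(fabs): XOR with 0x80000000 flips it, OR sets it, and the low
// half is reused unchanged in the REG_SEQUENCE.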
2818
2819// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2820bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2821 Register Dst = MI.getOperand(0).getReg();
2822 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2823 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2824 MRI->getType(Dst) != LLT::scalar(64))
2825 return false;
2826
2827 Register Src = MI.getOperand(1).getReg();
2828 MachineBasicBlock *BB = MI.getParent();
2829 const DebugLoc &DL = MI.getDebugLoc();
2830 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2831 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2832 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2833 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2834
2835 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2836 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2837 return false;
2838
2839 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2840 .addReg(Src, 0, AMDGPU::sub0);
2841 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2842 .addReg(Src, 0, AMDGPU::sub1);
2843 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2844 .addImm(0x7fffffff);
2845
2846 // Clear sign bit.
2847 // TODO: Should this use S_BITSET0_*?
2848 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2849 .addReg(HiReg)
2850 .addReg(ConstReg)
2851 .setOperandDead(3); // Dead scc
2852 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2853 .addReg(LoReg)
2854 .addImm(AMDGPU::sub0)
2855 .addReg(OpReg)
2856 .addImm(AMDGPU::sub1);
2857
2858 MI.eraseFromParent();
2859 return true;
2860}
2861
2862static bool isConstant(const MachineInstr &MI) {
2863 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2864}
2865
2866void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2867 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2868
2869 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2870 const MachineInstr *PtrMI =
2871 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2872
2873 assert(PtrMI);
2874
2875 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2876 return;
2877
2878 GEPInfo GEPInfo;
2879
2880 for (unsigned i = 1; i != 3; ++i) {
2881 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2882 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2883 assert(OpDef);
2884 if (i == 2 && isConstant(*OpDef)) {
2885 // TODO: Could handle constant base + variable offset, but a combine
2886 // probably should have commuted it.
2887 assert(GEPInfo.Imm == 0);
2888 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2889 continue;
2890 }
2891 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2892 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2893 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2894 else
2895 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2896 }
2897
2898 AddrInfo.push_back(GEPInfo);
2899 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2900}
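// E.g. a load through (ptr_add (ptr_add %base, 16), %idx) yields one GEPInfo
// per G_PTR_ADD in the chain, each recording its SGPR parts, VGPR parts and
// constant offset for the addressing-mode checks that follow.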
2901
2902bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2903 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2904}
2905
2906bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2907 if (!MI.hasOneMemOperand())
2908 return false;
2909
2910 const MachineMemOperand *MMO = *MI.memoperands_begin();
2911 const Value *Ptr = MMO->getValue();
2912
2913 // UndefValue means this is a load of a kernel input. These are uniform.
2914 // Sometimes LDS instructions have constant pointers.
2915 // If Ptr is null, then that means this mem operand contains a
2916 // PseudoSourceValue like GOT.
2917 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2918 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2919 return true;
2920
2921 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2922 return true;
2923
2924 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2925 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2926 AMDGPU::SGPRRegBankID;
2927
2928 const Instruction *I = dyn_cast<Instruction>(Ptr);
2929 return I && I->getMetadata("amdgpu.uniform");
2930}
2931
2932bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2933 for (const GEPInfo &GEPInfo : AddrInfo) {
2934 if (!GEPInfo.VgprParts.empty())
2935 return true;
2936 }
2937 return false;
2938}
2939
2940void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2941 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2942 unsigned AS = PtrTy.getAddressSpace();
2943 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2944 STI.ldsRequiresM0Init()) {
2945 MachineBasicBlock *BB = I.getParent();
2946
2947 // If DS instructions require M0 initialization, insert it before selecting.
2948 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2949 .addImm(-1);
2950 }
2951}
2952
2953bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2954 MachineInstr &I) const {
2955 initM0(I);
2956 return selectImpl(I, *CoverageInfo);
2957}
2958
2959static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2960 if (Reg.isPhysical())
2961 return false;
2962
2963 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2964 const unsigned Opcode = MI.getOpcode();
2965
2966 if (Opcode == AMDGPU::COPY)
2967 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2968
2969 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2970 Opcode == AMDGPU::G_XOR)
2971 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2972 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2973
2974 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2975 return GI->is(Intrinsic::amdgcn_class);
2976
2977 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2978}
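// Used by selectG_BRCOND below: condition values that are already V_CMP-style
// lane masks (possibly combined with and/or/xor, or produced by
// llvm.amdgcn.class) implicitly respect exec, so no extra AND with exec is
// inserted for them.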
2979
2980bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2981 MachineBasicBlock *BB = I.getParent();
2982 MachineOperand &CondOp = I.getOperand(0);
2983 Register CondReg = CondOp.getReg();
2984 const DebugLoc &DL = I.getDebugLoc();
2985
2986 unsigned BrOpcode;
2987 Register CondPhysReg;
2988 const TargetRegisterClass *ConstrainRC;
2989
2990 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2991 // whether the branch is uniform when selecting the instruction. In
2992 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2993 // RegBankSelect knows what it's doing if the branch condition is scc, even
2994 // though it currently does not.
2995 if (!isVCC(CondReg, *MRI)) {
2996 if (MRI->getType(CondReg) != LLT::scalar(32))
2997 return false;
2998
2999 CondPhysReg = AMDGPU::SCC;
3000 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3001 ConstrainRC = &AMDGPU::SReg_32RegClass;
3002 } else {
3003 // FIXME: Should scc->vcc copies be ANDed with exec?
3004
3005 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3006 // need to insert an and with exec.
3007 if (!isVCmpResult(CondReg, *MRI)) {
3008 const bool Is64 = STI.isWave64();
3009 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3010 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3011
3012 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3013 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3014 .addReg(CondReg)
3015 .addReg(Exec)
3016 .setOperandDead(3); // Dead scc
3017 CondReg = TmpReg;
3018 }
3019
3020 CondPhysReg = TRI.getVCC();
3021 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3022 ConstrainRC = TRI.getBoolRC();
3023 }
3024
3025 if (!MRI->getRegClassOrNull(CondReg))
3026 MRI->setRegClass(CondReg, ConstrainRC);
3027
3028 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3029 .addReg(CondReg);
3030 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3031 .addMBB(I.getOperand(1).getMBB());
3032
3033 I.eraseFromParent();
3034 return true;
3035}
3036
3037bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3038 MachineInstr &I) const {
3039 Register DstReg = I.getOperand(0).getReg();
3040 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3041 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3042 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3043 if (IsVGPR)
3044 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3045
3046 return RBI.constrainGenericRegister(
3047 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3048}
3049
3050bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3051 Register DstReg = I.getOperand(0).getReg();
3052 Register SrcReg = I.getOperand(1).getReg();
3053 Register MaskReg = I.getOperand(2).getReg();
3054 LLT Ty = MRI->getType(DstReg);
3055 LLT MaskTy = MRI->getType(MaskReg);
3056 MachineBasicBlock *BB = I.getParent();
3057 const DebugLoc &DL = I.getDebugLoc();
3058
3059 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3060 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3061 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3062 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3063 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3064 return false;
3065
3066 // Try to avoid emitting a bit operation when we only need to touch half of
3067 // the 64-bit pointer.
3068 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
3069 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3070 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3071
3072 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3073 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
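// For example, masking a 64-bit SGPR pointer with 0xffffffff'fffff000 has an
// all-ones high half (CanCopyHi32), so only the low 32 bits of the pointer
// need an S_AND_B32; the high half is simply copied into the REG_SEQUENCE
// below.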
3074
3075 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3076 !CanCopyLow32 && !CanCopyHi32) {
3077 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3078 .addReg(SrcReg)
3079 .addReg(MaskReg)
3080 .setOperandDead(3); // Dead scc
3081 I.eraseFromParent();
3082 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3083 }
3084
3085 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3086 const TargetRegisterClass &RegRC
3087 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3088
3089 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3090 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3091 const TargetRegisterClass *MaskRC =
3092 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3093
3094 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3095 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3096 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3097 return false;
3098
3099 if (Ty.getSizeInBits() == 32) {
3100 assert(MaskTy.getSizeInBits() == 32 &&
3101 "ptrmask should have been narrowed during legalize");
3102
3103 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3104 .addReg(SrcReg)
3105 .addReg(MaskReg);
3106
3107 if (!IsVGPR)
3108 NewOp.setOperandDead(3); // Dead scc
3109 I.eraseFromParent();
3110 return true;
3111 }
3112
3113 Register HiReg = MRI->createVirtualRegister(&RegRC);
3114 Register LoReg = MRI->createVirtualRegister(&RegRC);
3115
3116 // Extract the subregisters from the source pointer.
3117 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3118 .addReg(SrcReg, 0, AMDGPU::sub0);
3119 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3120 .addReg(SrcReg, 0, AMDGPU::sub1);
3121
3122 Register MaskedLo, MaskedHi;
3123
3124 if (CanCopyLow32) {
3125 // If all the bits in the low half are 1, we only need a copy for it.
3126 MaskedLo = LoReg;
3127 } else {
3128 // Extract the mask subregister and apply the and.
3129 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3130 MaskedLo = MRI->createVirtualRegister(&RegRC);
3131
3132 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3133 .addReg(MaskReg, 0, AMDGPU::sub0);
3134 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3135 .addReg(LoReg)
3136 .addReg(MaskLo);
3137 }
3138
3139 if (CanCopyHi32) {
3140 // If all the bits in the high half are 1, we only need a copy for it.
3141 MaskedHi = HiReg;
3142 } else {
3143 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3144 MaskedHi = MRI->createVirtualRegister(&RegRC);
3145
3146 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3147 .addReg(MaskReg, 0, AMDGPU::sub1);
3148 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3149 .addReg(HiReg)
3150 .addReg(MaskHi);
3151 }
3152
3153 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3154 .addReg(MaskedLo)
3155 .addImm(AMDGPU::sub0)
3156 .addReg(MaskedHi)
3157 .addImm(AMDGPU::sub1);
3158 I.eraseFromParent();
3159 return true;
3160}
3161
3162/// Return the register to use for the index value, and the subregister to use
3163/// for the indirectly accessed register.
3164static std::pair<Register, unsigned>
3165computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3166 const TargetRegisterClass *SuperRC, Register IdxReg,
3167 unsigned EltSize, GISelKnownBits &KnownBits) {
3168 Register IdxBaseReg;
3169 int Offset;
3170
3171 std::tie(IdxBaseReg, Offset) =
3172 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3173 if (IdxBaseReg == AMDGPU::NoRegister) {
3174 // This will happen if the index is a known constant. This should ordinarily
3175 // be legalized out, but handle it as a register just in case.
3176 assert(Offset == 0);
3177 IdxBaseReg = IdxReg;
3178 }
3179
3180 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3181
3182 // Skip out of bounds offsets, or else we would end up using an undefined
3183 // register.
3184 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3185 return std::pair(IdxReg, SubRegs[0]);
3186 return std::pair(IdxBaseReg, SubRegs[Offset]);
3187}
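// E.g. indexing 32-bit elements (EltSize = 4) of a 128-bit register with an
// index known to be %base + 2 returns (%base, sub2): the constant part of the
// index becomes a subregister and only the dynamic part stays in the index
// register.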
3188
3189bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3190 MachineInstr &MI) const {
3191 Register DstReg = MI.getOperand(0).getReg();
3192 Register SrcReg = MI.getOperand(1).getReg();
3193 Register IdxReg = MI.getOperand(2).getReg();
3194
3195 LLT DstTy = MRI->getType(DstReg);
3196 LLT SrcTy = MRI->getType(SrcReg);
3197
3198 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3199 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3200 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3201
3202 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3203 // into a waterfall loop.
3204 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3205 return false;
3206
3207 const TargetRegisterClass *SrcRC =
3208 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3209 const TargetRegisterClass *DstRC =
3210 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3211 if (!SrcRC || !DstRC)
3212 return false;
3213 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3214 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3215 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3216 return false;
3217
3218 MachineBasicBlock *BB = MI.getParent();
3219 const DebugLoc &DL = MI.getDebugLoc();
3220 const bool Is64 = DstTy.getSizeInBits() == 64;
3221
3222 unsigned SubReg;
3223 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3224 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3225
3226 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3227 if (DstTy.getSizeInBits() != 32 && !Is64)
3228 return false;
3229
3230 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3231 .addReg(IdxReg);
3232
3233 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3234 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3235 .addReg(SrcReg, 0, SubReg)
3236 .addReg(SrcReg, RegState::Implicit);
3237 MI.eraseFromParent();
3238 return true;
3239 }
3240
3241 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3242 return false;
3243
3244 if (!STI.useVGPRIndexMode()) {
3245 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3246 .addReg(IdxReg);
3247 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3248 .addReg(SrcReg, 0, SubReg)
3249 .addReg(SrcReg, RegState::Implicit);
3250 MI.eraseFromParent();
3251 return true;
3252 }
3253
3254 const MCInstrDesc &GPRIDXDesc =
3255 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3256 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3257 .addReg(SrcReg)
3258 .addReg(IdxReg)
3259 .addImm(SubReg);
3260
3261 MI.eraseFromParent();
3262 return true;
3263}
3264
3265// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3266bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3267 MachineInstr &MI) const {
3268 Register DstReg = MI.getOperand(0).getReg();
3269 Register VecReg = MI.getOperand(1).getReg();
3270 Register ValReg = MI.getOperand(2).getReg();
3271 Register IdxReg = MI.getOperand(3).getReg();
3272
3273 LLT VecTy = MRI->getType(DstReg);
3274 LLT ValTy = MRI->getType(ValReg);
3275 unsigned VecSize = VecTy.getSizeInBits();
3276 unsigned ValSize = ValTy.getSizeInBits();
3277
3278 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3279 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3280 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3281
3282 assert(VecTy.getElementType() == ValTy);
3283
3284 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3285 // into a waterfall loop.
3286 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3287 return false;
3288
3289 const TargetRegisterClass *VecRC =
3290 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3291 const TargetRegisterClass *ValRC =
3292 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3293
3294 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3295 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3296 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3297 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3298 return false;
3299
3300 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3301 return false;
3302
3303 unsigned SubReg;
3304 std::tie(IdxReg, SubReg) =
3305 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3306
3307 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3308 STI.useVGPRIndexMode();
3309
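// As in the extract case: without VGPR-index mode the write goes through a
// MovRel pseudo with the index in M0; with it, the GPR-index pseudo carries
// the index register directly.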
3310 MachineBasicBlock *BB = MI.getParent();
3311 const DebugLoc &DL = MI.getDebugLoc();
3312
3313 if (!IndexMode) {
3314 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3315 .addReg(IdxReg);
3316
3317 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3318 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3319 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3320 .addReg(VecReg)
3321 .addReg(ValReg)
3322 .addImm(SubReg);
3323 MI.eraseFromParent();
3324 return true;
3325 }
3326
3327 const MCInstrDesc &GPRIDXDesc =
3328 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3329 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3330 .addReg(VecReg)
3331 .addReg(ValReg)
3332 .addReg(IdxReg)
3333 .addImm(SubReg);
3334
3335 MI.eraseFromParent();
3336 return true;
3337}
3338
3339bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3341 unsigned Opc;
3342 unsigned Size = MI.getOperand(3).getImm();
3343
3344 // The struct intrinsic variants add one additional operand over raw.
3345 const bool HasVIndex = MI.getNumOperands() == 9;
3346 Register VIndex;
3347 int OpOffset = 0;
3348 if (HasVIndex) {
3349 VIndex = MI.getOperand(4).getReg();
3350 OpOffset = 1;
3351 }
3352
3353 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3354 std::optional<ValueAndVReg> MaybeVOffset =
3355 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3356 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3357
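// The opcode is chosen along two axes: the size of the element being loaded
// and which of vindex/voffset are present (BOTHEN / IDXEN / OFFEN / OFFSET).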
3358 switch (Size) {
3359 default:
3360 return false;
3361 case 1:
3362 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3363 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3364 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3365 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3366 break;
3367 case 2:
3368 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3369 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3370 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3371 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3372 break;
3373 case 4:
3374 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3375 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3376 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3377 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3378 break;
3379 case 12:
3380 if (!Subtarget->hasLDSLoadB96_B128())
3381 return false;
3382
3383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3384 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3385 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3386 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3387 break;
3388 case 16:
3389 if (!Subtarget->hasLDSLoadB96_B128())
3390 return false;
3391
3392 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3393 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3394 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3395 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3396 break;
3397 }
3398
3399 MachineBasicBlock *MBB = MI.getParent();
3400 const DebugLoc &DL = MI.getDebugLoc();
3401 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3402 .add(MI.getOperand(2));
3403
3404 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3405
3406 if (HasVIndex && HasVOffset) {
3407 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3408 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3409 .addReg(VIndex)
3410 .addImm(AMDGPU::sub0)
3411 .addReg(VOffset)
3412 .addImm(AMDGPU::sub1);
3413
3414 MIB.addReg(IdxReg);
3415 } else if (HasVIndex) {
3416 MIB.addReg(VIndex);
3417 } else if (HasVOffset) {
3418 MIB.addReg(VOffset);
3419 }
3420
3421 MIB.add(MI.getOperand(1)); // rsrc
3422 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3423 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3424 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3425 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3426 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3427 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3428 MIB.addImm(
3429 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3430 ? 1
3431 : 0); // swz
3432
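// Rebuild the memory operands: the original MMO is split into a load MMO for
// the buffer read and a store MMO for the LDS write, so both sides of the
// transfer are modeled.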
3433 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3434 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3435 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3436 MachinePointerInfo StorePtrI = LoadPtrI;
3437 StorePtrI.V = nullptr;
3438 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3439
3440 auto F = LoadMMO->getFlags() &
3441 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3442 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3443 Size, LoadMMO->getBaseAlign());
3444
3445 MachineMemOperand *StoreMMO =
3446 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3447 sizeof(int32_t), LoadMMO->getBaseAlign());
3448
3449 MIB.setMemRefs({LoadMMO, StoreMMO});
3450
3451 MI.eraseFromParent();
3452 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3453}
3454
3455 /// Match a zero extend from a 32-bit value to 64-bits.
3456 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3457 Register ZExtSrc;
3458 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3459 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3460
3461 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3462 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3463 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3464 return Register();
3465
3466 assert(Def->getNumOperands() == 3 &&
3467 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3468 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3469 return Def->getOperand(1).getReg();
3470 }
3471
3472 return Register();
3473}
3474
3475bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3476 unsigned Opc;
3477 unsigned Size = MI.getOperand(3).getImm();
3478
3479 switch (Size) {
3480 default:
3481 return false;
3482 case 1:
3483 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3484 break;
3485 case 2:
3486 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3487 break;
3488 case 4:
3489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3490 break;
3491 case 12:
3492 if (!Subtarget->hasLDSLoadB96_B128())
3493 return false;
3494 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3495 break;
3496 case 16:
3497 if (!Subtarget->hasLDSLoadB96_B128())
3498 return false;
3499 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3500 break;
3501 }
3502
3503 MachineBasicBlock *MBB = MI.getParent();
3504 const DebugLoc &DL = MI.getDebugLoc();
3505 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3506 .add(MI.getOperand(2));
3507
3508 Register Addr = MI.getOperand(1).getReg();
3509 Register VOffset;
3510 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3511 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3512 if (!isSGPR(Addr)) {
3513 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3514 if (isSGPR(AddrDef->Reg)) {
3515 Addr = AddrDef->Reg;
3516 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3517 Register SAddr =
3518 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3519 if (isSGPR(SAddr)) {
3520 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3521 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3522 Addr = SAddr;
3523 VOffset = Off;
3524 }
3525 }
3526 }
3527 }
3528
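// If a uniform base was found, switch to the SADDR form of the opcode. That
// form still expects a VGPR offset operand, so materialize a zero VOffset
// when no variable offset was split out above.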
3529 if (isSGPR(Addr)) {
3530 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3531 if (!VOffset) {
3532 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3533 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3534 .addImm(0);
3535 }
3536 }
3537
3538 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3539 .addReg(Addr);
3540
3541 if (isSGPR(Addr))
3542 MIB.addReg(VOffset);
3543
3544 MIB.add(MI.getOperand(4)) // offset
3545 .add(MI.getOperand(5)); // cpol
3546
3547 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3548 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3549 LoadPtrI.Offset = MI.getOperand(4).getImm();
3550 MachinePointerInfo StorePtrI = LoadPtrI;
3551 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3552 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3553 auto F = LoadMMO->getFlags() &
3554 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3555 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3556 Size, LoadMMO->getBaseAlign());
3557 MachineMemOperand *StoreMMO =
3558 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3559 sizeof(int32_t), Align(4));
3560
3561 MIB.setMemRefs({LoadMMO, StoreMMO});
3562
3563 MI.eraseFromParent();
3564 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3565}
3566
3567bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3568 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3569 MI.removeOperand(1);
3570 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3571 return true;
3572}
3573
3574// FIXME: This should be removed and let the patterns select. We just need the
3575// AGPR/VGPR combination versions.
3576bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3577 unsigned Opc;
3578 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3579 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3580 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3581 break;
3582 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3583 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3584 break;
3585 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3586 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3587 break;
3588 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3589 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3590 break;
3591 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3592 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3593 break;
3594 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3595 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3596 break;
3597 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3598 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3599 break;
3600 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3601 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3602 break;
3603 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3604 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3605 break;
3606 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3607 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3608 break;
3609 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3610 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3611 break;
3612 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3613 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3614 break;
3615 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3616 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3617 break;
3618 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3619 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3620 break;
3621 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3622 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3623 break;
3624 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3625 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3626 break;
3627 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3628 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3629 break;
3630 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3631 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3632 break;
3633 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3634 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3635 break;
3636 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3637 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3638 break;
3639 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3640 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3641 break;
3642 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3643 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3644 break;
3645 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3646 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3647 break;
3648 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3649 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3650 break;
3651 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3652 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3653 break;
3654 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3655 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3656 break;
3657 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3658 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3659 break;
3660 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3661 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3662 break;
3663 default:
3664 llvm_unreachable("unhandled smfmac intrinsic");
3665 }
3666
3667 auto VDst_In = MI.getOperand(4);
3668
3669 MI.setDesc(TII.get(Opc));
3670 MI.removeOperand(4); // VDst_In
3671 MI.removeOperand(1); // Intrinsic ID
3672 MI.addOperand(VDst_In); // Readd VDst_In to the end
3673 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3674 return true;
3675}
3676
3677bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3678 MachineInstr &MI, Intrinsic::ID IntrID) const {
3679 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3680 !Subtarget->hasPermlane16Swap())
3681 return false;
3682 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3683 !Subtarget->hasPermlane32Swap())
3684 return false;
3685
3686 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3687 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3688 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3689
3690 MI.removeOperand(2);
3691 MI.setDesc(TII.get(Opcode));
3692 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3693
3694 MachineOperand &FI = MI.getOperand(4);
3696
3697 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3698}
3699
3700bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3701 Register DstReg = MI.getOperand(0).getReg();
3702 Register SrcReg = MI.getOperand(1).getReg();
3703 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3704 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3705 MachineBasicBlock *MBB = MI.getParent();
3706 const DebugLoc &DL = MI.getDebugLoc();
3707
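// The wave-level address is the per-lane scratch address shifted right by
// log2(wavefront size); use the VALU or SALU shift depending on the
// destination register bank.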
3708 if (IsVALU) {
3709 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3710 .addImm(Subtarget->getWavefrontSizeLog2())
3711 .addReg(SrcReg);
3712 } else {
3713 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3714 .addReg(SrcReg)
3715 .addImm(Subtarget->getWavefrontSizeLog2())
3716 .setOperandDead(3); // Dead scc
3717 }
3718
3719 const TargetRegisterClass &RC =
3720 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3721 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3722 return false;
3723
3724 MI.eraseFromParent();
3725 return true;
3726}
3727
3728 // Match a BITOP3 operation and return the number of matched instructions plus
3729 // the truth table.
3730 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3731 SmallVectorImpl<Register> &Src,
3732 const MachineRegisterInfo &MRI) {
3733 unsigned NumOpcodes = 0;
3734 uint8_t LHSBits, RHSBits;
3735
3736 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3737 // Define truth table given Src0, Src1, Src2 bits permutations:
3738 // 0 0 0
3739 // 0 0 1
3740 // 0 1 0
3741 // 0 1 1
3742 // 1 0 0
3743 // 1 0 1
3744 // 1 1 0
3745 // 1 1 1
3746 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
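// Worked example: for (a & b) | c the resulting table is
// (0xf0 & 0xcc) | 0xaa = 0xea.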
3747
3748 if (mi_match(Op, MRI, m_AllOnesInt())) {
3749 Bits = 0xff;
3750 return true;
3751 }
3752 if (mi_match(Op, MRI, m_ZeroInt())) {
3753 Bits = 0;
3754 return true;
3755 }
3756
3757 for (unsigned I = 0; I < Src.size(); ++I) {
3758 // Try to find existing reused operand
3759 if (Src[I] == Op) {
3760 Bits = SrcBits[I];
3761 return true;
3762 }
3763 // Try to replace parent operator
3764 if (Src[I] == R) {
3765 Bits = SrcBits[I];
3766 Src[I] = Op;
3767 return true;
3768 }
3769 }
3770
3771 if (Src.size() == 3) {
3772 // No room left for operands. Try one last time, there can be a 'not' of
3773 // one of our source operands. In this case we can compute the bits
3774 // without growing Src vector.
3775 Register LHS;
3776 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3777 LHS = getSrcRegIgnoringCopies(LHS, MRI);
3778 for (unsigned I = 0; I < Src.size(); ++I) {
3779 if (Src[I] == LHS) {
3780 Bits = ~SrcBits[I];
3781 return true;
3782 }
3783 }
3784 }
3785
3786 return false;
3787 }
3788
3789 Bits = SrcBits[Src.size()];
3790 Src.push_back(Op);
3791 return true;
3792 };
3793
3794 MachineInstr *MI = MRI.getVRegDef(R);
3795 switch (MI->getOpcode()) {
3796 case TargetOpcode::G_AND:
3797 case TargetOpcode::G_OR:
3798 case TargetOpcode::G_XOR: {
3799 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3800 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3801
3802 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3803 if (!getOperandBits(LHS, LHSBits) ||
3804 !getOperandBits(RHS, RHSBits)) {
3805 Src = Backup;
3806 return std::make_pair(0, 0);
3807 }
3808
3809 // Recursion is naturally limited by the size of the operand vector.
3810 auto Op = BitOp3_Op(LHS, Src, MRI);
3811 if (Op.first) {
3812 NumOpcodes += Op.first;
3813 LHSBits = Op.second;
3814 }
3815
3816 Op = BitOp3_Op(RHS, Src, MRI);
3817 if (Op.first) {
3818 NumOpcodes += Op.first;
3819 RHSBits = Op.second;
3820 }
3821 break;
3822 }
3823 default:
3824 return std::make_pair(0, 0);
3825 }
3826
3827 uint8_t TTbl;
3828 switch (MI->getOpcode()) {
3829 case TargetOpcode::G_AND:
3830 TTbl = LHSBits & RHSBits;
3831 break;
3832 case TargetOpcode::G_OR:
3833 TTbl = LHSBits | RHSBits;
3834 break;
3835 case TargetOpcode::G_XOR:
3836 TTbl = LHSBits ^ RHSBits;
3837 break;
3838 default:
3839 break;
3840 }
3841
3842 return std::make_pair(NumOpcodes + 1, TTbl);
3843}
3844
3845bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3846 if (!Subtarget->hasBitOp3Insts())
3847 return false;
3848
3849 Register DstReg = MI.getOperand(0).getReg();
3850 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3851 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3852 if (!IsVALU)
3853 return false;
3854
3856 uint8_t TTbl;
3857 unsigned NumOpcodes;
3858
3859 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3860
3861 // The Src.empty() case can happen if the operands are all zeros or all ones.
3862 // Normally this should have been optimized out before reaching this point.
3863 if (NumOpcodes < 2 || Src.empty())
3864 return false;
3865
3866 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3867 if (NumOpcodes == 2 && IsB32) {
3868 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3869 // asm more readable. This cannot be modeled with AddedComplexity because the
3870 // selector does not know how many operations we matched.
3871 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3872 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3873 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3874 return false;
3875 } else if (NumOpcodes < 4) {
3876 // For a uniform case the threshold should be higher to account for moves
3877 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3878 // be in SGPRs, with a readfirstlane afterwards.
3879 return false;
3880 }
3881
3882 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3883 unsigned CBL = STI.getConstantBusLimit(Opc);
3884 MachineBasicBlock *MBB = MI.getParent();
3885 const DebugLoc &DL = MI.getDebugLoc();
3886
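// Respect the constant bus limit: only the first CBL scalar sources may stay
// in SGPRs; any further SGPR sources are copied into VGPRs first.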
3887 for (unsigned I = 0; I < Src.size(); ++I) {
3888 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3889 if (RB->getID() != AMDGPU::SGPRRegBankID)
3890 continue;
3891 if (CBL > 0) {
3892 --CBL;
3893 continue;
3894 }
3895 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3896 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3897 .addReg(Src[I]);
3898 Src[I] = NewReg;
3899 }
3900
3901 // The last operand can be ignored, turning a ternary operation into a binary
3902 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3903 // 'c' with 'a' here without changing the answer. In some pathological cases
3904 // it should even be possible to get an operation with a single operand, if
3905 // the optimizer did not catch it.
3906 while (Src.size() < 3)
3907 Src.push_back(Src[0]);
3908
3909 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3910 if (!IsB32)
3911 MIB.addImm(0); // src_mod0
3912 MIB.addReg(Src[0]);
3913 if (!IsB32)
3914 MIB.addImm(0); // src_mod1
3915 MIB.addReg(Src[1]);
3916 if (!IsB32)
3917 MIB.addImm(0); // src_mod2
3918 MIB.addReg(Src[2])
3919 .addImm(TTbl);
3920 if (!IsB32)
3921 MIB.addImm(0); // op_sel
3922
3923 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3924 MI.eraseFromParent();
3925
3926 return true;
3927}
3928
3929bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3930 Register SrcReg = MI.getOperand(0).getReg();
3931 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3932 return false;
3933
3934 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3935 Register SP =
3936 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3937 Register WaveAddr = getWaveAddress(DefMI);
3938 MachineBasicBlock *MBB = MI.getParent();
3939 const DebugLoc &DL = MI.getDebugLoc();
3940
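// If the source does not come from G_AMDGPU_WAVE_ADDRESS, shift the per-lane
// value back down to a wave-level address before copying it into the stack
// pointer register.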
3941 if (!WaveAddr) {
3942 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3943 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3944 .addReg(SrcReg)
3945 .addImm(Subtarget->getWavefrontSizeLog2())
3946 .setOperandDead(3); // Dead scc
3947 }
3948
3949 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3950 .addReg(WaveAddr);
3951
3952 MI.eraseFromParent();
3953 return true;
3954}
3955
3956 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3957
3958 if (!I.isPreISelOpcode()) {
3959 if (I.isCopy())
3960 return selectCOPY(I);
3961 return true;
3962 }
3963
3964 switch (I.getOpcode()) {
3965 case TargetOpcode::G_AND:
3966 case TargetOpcode::G_OR:
3967 case TargetOpcode::G_XOR:
3968 if (selectBITOP3(I))
3969 return true;
3970 if (selectImpl(I, *CoverageInfo))
3971 return true;
3972 return selectG_AND_OR_XOR(I);
3973 case TargetOpcode::G_ADD:
3974 case TargetOpcode::G_SUB:
3975 case TargetOpcode::G_PTR_ADD:
3976 if (selectImpl(I, *CoverageInfo))
3977 return true;
3978 return selectG_ADD_SUB(I);
3979 case TargetOpcode::G_UADDO:
3980 case TargetOpcode::G_USUBO:
3981 case TargetOpcode::G_UADDE:
3982 case TargetOpcode::G_USUBE:
3983 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3984 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3985 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3986 return selectG_AMDGPU_MAD_64_32(I);
3987 case TargetOpcode::G_INTTOPTR:
3988 case TargetOpcode::G_BITCAST:
3989 case TargetOpcode::G_PTRTOINT:
3990 case TargetOpcode::G_FREEZE:
3991 return selectCOPY(I);
3992 case TargetOpcode::G_FNEG:
3993 if (selectImpl(I, *CoverageInfo))
3994 return true;
3995 return selectG_FNEG(I);
3996 case TargetOpcode::G_FABS:
3997 if (selectImpl(I, *CoverageInfo))
3998 return true;
3999 return selectG_FABS(I);
4000 case TargetOpcode::G_EXTRACT:
4001 return selectG_EXTRACT(I);
4002 case TargetOpcode::G_MERGE_VALUES:
4003 case TargetOpcode::G_CONCAT_VECTORS:
4004 return selectG_MERGE_VALUES(I);
4005 case TargetOpcode::G_UNMERGE_VALUES:
4006 return selectG_UNMERGE_VALUES(I);
4007 case TargetOpcode::G_BUILD_VECTOR:
4008 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4009 return selectG_BUILD_VECTOR(I);
4010 case TargetOpcode::G_IMPLICIT_DEF:
4011 return selectG_IMPLICIT_DEF(I);
4012 case TargetOpcode::G_INSERT:
4013 return selectG_INSERT(I);
4014 case TargetOpcode::G_INTRINSIC:
4015 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4016 return selectG_INTRINSIC(I);
4017 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4018 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4019 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4020 case TargetOpcode::G_ICMP:
4021 case TargetOpcode::G_FCMP:
4022 if (selectG_ICMP_or_FCMP(I))
4023 return true;
4024 return selectImpl(I, *CoverageInfo);
4025 case TargetOpcode::G_LOAD:
4026 case TargetOpcode::G_ZEXTLOAD:
4027 case TargetOpcode::G_SEXTLOAD:
4028 case TargetOpcode::G_STORE:
4029 case TargetOpcode::G_ATOMIC_CMPXCHG:
4030 case TargetOpcode::G_ATOMICRMW_XCHG:
4031 case TargetOpcode::G_ATOMICRMW_ADD:
4032 case TargetOpcode::G_ATOMICRMW_SUB:
4033 case TargetOpcode::G_ATOMICRMW_AND:
4034 case TargetOpcode::G_ATOMICRMW_OR:
4035 case TargetOpcode::G_ATOMICRMW_XOR:
4036 case TargetOpcode::G_ATOMICRMW_MIN:
4037 case TargetOpcode::G_ATOMICRMW_MAX:
4038 case TargetOpcode::G_ATOMICRMW_UMIN:
4039 case TargetOpcode::G_ATOMICRMW_UMAX:
4040 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4041 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4042 case TargetOpcode::G_ATOMICRMW_FADD:
4043 case TargetOpcode::G_ATOMICRMW_FMIN:
4044 case TargetOpcode::G_ATOMICRMW_FMAX:
4045 return selectG_LOAD_STORE_ATOMICRMW(I);
4046 case TargetOpcode::G_SELECT:
4047 return selectG_SELECT(I);
4048 case TargetOpcode::G_TRUNC:
4049 return selectG_TRUNC(I);
4050 case TargetOpcode::G_SEXT:
4051 case TargetOpcode::G_ZEXT:
4052 case TargetOpcode::G_ANYEXT:
4053 case TargetOpcode::G_SEXT_INREG:
4054 // This is a workaround. For extension from type i1, `selectImpl()` uses
4055 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4056 // type i1 can only be held in an SGPR class.
4057 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4058 selectImpl(I, *CoverageInfo))
4059 return true;
4060 return selectG_SZA_EXT(I);
4061 case TargetOpcode::G_FPEXT:
4062 if (selectG_FPEXT(I))
4063 return true;
4064 return selectImpl(I, *CoverageInfo);
4065 case TargetOpcode::G_BRCOND:
4066 return selectG_BRCOND(I);
4067 case TargetOpcode::G_GLOBAL_VALUE:
4068 return selectG_GLOBAL_VALUE(I);
4069 case TargetOpcode::G_PTRMASK:
4070 return selectG_PTRMASK(I);
4071 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4072 return selectG_EXTRACT_VECTOR_ELT(I);
4073 case TargetOpcode::G_INSERT_VECTOR_ELT:
4074 return selectG_INSERT_VECTOR_ELT(I);
4075 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4076 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4077 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4078 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4079 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4080 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4081 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4082 assert(Intr && "not an image intrinsic with image pseudo");
4083 return selectImageIntrinsic(I, Intr);
4084 }
4085 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4086 return selectBVHIntrinsic(I);
4087 case AMDGPU::G_SBFX:
4088 case AMDGPU::G_UBFX:
4089 return selectG_SBFX_UBFX(I);
4090 case AMDGPU::G_SI_CALL:
4091 I.setDesc(TII.get(AMDGPU::SI_CALL));
4092 return true;
4093 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4094 return selectWaveAddress(I);
4095 case AMDGPU::G_STACKRESTORE:
4096 return selectStackRestore(I);
4097 case AMDGPU::G_PHI:
4098 return selectPHI(I);
4099 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4100 return selectCOPY_SCC_VCC(I);
4101 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4102 return selectCOPY_VCC_SCC(I);
4103 case AMDGPU::G_AMDGPU_READANYLANE:
4104 return selectReadAnyLane(I);
4105 case TargetOpcode::G_CONSTANT:
4106 case TargetOpcode::G_FCONSTANT:
4107 default:
4108 return selectImpl(I, *CoverageInfo);
4109 }
4110 return false;
4111}
4112
4113 InstructionSelector::ComplexRendererFns
4114AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4115 return {{
4116 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4117 }};
4118
4119}
4120
4121std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4122 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4123 unsigned Mods = 0;
4124 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4125
4126 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4127 Src = MI->getOperand(1).getReg();
4128 Mods |= SISrcMods::NEG;
4129 MI = getDefIgnoringCopies(Src, *MRI);
4130 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4131 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4132 // denormal mode, but we're implicitly canonicalizing in a source operand.
4133 const ConstantFP *LHS =
4134 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4135 if (LHS && LHS->isZero()) {
4136 Mods |= SISrcMods::NEG;
4137 Src = MI->getOperand(2).getReg();
4138 }
4139 }
4140
4141 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4142 Src = MI->getOperand(1).getReg();
4143 Mods |= SISrcMods::ABS;
4144 }
4145
4146 if (OpSel)
4147 Mods |= SISrcMods::OP_SEL_0;
4148
4149 return std::pair(Src, Mods);
4150}
4151
4152Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4153 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4154 bool ForceVGPR) const {
4155 if ((Mods != 0 || ForceVGPR) &&
4156 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4157
4158 // If we looked through copies to find source modifiers on an SGPR operand,
4159 // we now have an SGPR register source. To avoid potentially violating the
4160 // constant bus restriction, we need to insert a copy to a VGPR.
4161 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4162 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4163 TII.get(AMDGPU::COPY), VGPRSrc)
4164 .addReg(Src);
4165 Src = VGPRSrc;
4166 }
4167
4168 return Src;
4169}
4170
4171///
4172/// This will select either an SGPR or VGPR operand and will save us from
4173 /// having to write an extra tablegen pattern.
4174 InstructionSelector::ComplexRendererFns
4175AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4176 return {{
4177 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4178 }};
4179}
4180
4181 InstructionSelector::ComplexRendererFns
4182AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4183 Register Src;
4184 unsigned Mods;
4185 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4186
4187 return {{
4188 [=](MachineInstrBuilder &MIB) {
4189 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4190 },
4191 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4192 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4193 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4194 }};
4195}
4196
4197 InstructionSelector::ComplexRendererFns
4198AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4199 Register Src;
4200 unsigned Mods;
4201 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4202 /*IsCanonicalizing=*/true,
4203 /*AllowAbs=*/false);
4204
4205 return {{
4206 [=](MachineInstrBuilder &MIB) {
4207 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4208 },
4209 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4210 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4211 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4212 }};
4213}
4214
4215 InstructionSelector::ComplexRendererFns
4216AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4217 return {{
4218 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4219 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4220 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4221 }};
4222}
4223
4224 InstructionSelector::ComplexRendererFns
4225AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4226 Register Src;
4227 unsigned Mods;
4228 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4229
4230 return {{
4231 [=](MachineInstrBuilder &MIB) {
4232 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4233 },
4234 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4235 }};
4236}
4237
4238 InstructionSelector::ComplexRendererFns
4239AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4240 MachineOperand &Root) const {
4241 Register Src;
4242 unsigned Mods;
4243 std::tie(Src, Mods) =
4244 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4245
4246 return {{
4247 [=](MachineInstrBuilder &MIB) {
4248 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4249 },
4250 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4251 }};
4252}
4253
4254 InstructionSelector::ComplexRendererFns
4255AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4256 Register Src;
4257 unsigned Mods;
4258 std::tie(Src, Mods) =
4259 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4260 /*AllowAbs=*/false);
4261
4262 return {{
4263 [=](MachineInstrBuilder &MIB) {
4264 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4265 },
4266 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4267 }};
4268}
4269
4270 InstructionSelector::ComplexRendererFns
4271AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4272 Register Reg = Root.getReg();
4273 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4274 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4275 return {};
4276 return {{
4277 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4278 }};
4279}
4280
4281std::pair<Register, unsigned>
4282AMDGPUInstructionSelector::selectVOP3PModsImpl(
4283 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4284 unsigned Mods = 0;
4285 MachineInstr *MI = MRI.getVRegDef(Src);
4286
4287 if (MI->getOpcode() == AMDGPU::G_FNEG &&
4288 // It's possible to see an f32 fneg here, but unlikely.
4289 // TODO: Treat f32 fneg as only high bit.
4290 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
4291 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4292 Src = MI->getOperand(1).getReg();
4293 MI = MRI.getVRegDef(Src);
4294 }
4295
4296 // TODO: Handle G_FSUB 0 as fneg
4297
4298 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4299 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4300
4301 // Packed instructions do not have abs modifiers.
4302 Mods |= SISrcMods::OP_SEL_1;
4303
4304 return std::pair(Src, Mods);
4305}
4306
4307 InstructionSelector::ComplexRendererFns
4308AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4309 MachineRegisterInfo &MRI
4310 = Root.getParent()->getParent()->getParent()->getRegInfo();
4311
4312 Register Src;
4313 unsigned Mods;
4314 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4315
4316 return {{
4317 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4318 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4319 }};
4320}
4321
4322 InstructionSelector::ComplexRendererFns
4323AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4324 MachineRegisterInfo &MRI
4325 = Root.getParent()->getParent()->getParent()->getRegInfo();
4326
4327 Register Src;
4328 unsigned Mods;
4329 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
4330
4331 return {{
4332 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4333 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4334 }};
4335}
4336
4337 InstructionSelector::ComplexRendererFns
4338AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4339 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
4340 // Value is in Imm operand as i1 sign extended to int64_t.
4341 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
4342 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4343 "expected i1 value");
4344 unsigned Mods = SISrcMods::OP_SEL_1;
4345 if (Root.getImm() == -1)
4346 Mods ^= SISrcMods::NEG;
4347 return {{
4348 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4349 }};
4350}
4351
4352 InstructionSelector::ComplexRendererFns
4353AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4354 MachineOperand &Root) const {
4355 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4356 "expected i1 value");
4357 unsigned Mods = SISrcMods::OP_SEL_1;
4358 if (Root.getImm() != 0)
4359 Mods |= SISrcMods::OP_SEL_0;
4360
4361 return {{
4362 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4363 }};
4364}
4365
4366 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4367 MachineInstr *InsertPt,
4368 MachineRegisterInfo &MRI) {
4369 const TargetRegisterClass *DstRegClass;
4370 switch (Elts.size()) {
4371 case 8:
4372 DstRegClass = &AMDGPU::VReg_256RegClass;
4373 break;
4374 case 4:
4375 DstRegClass = &AMDGPU::VReg_128RegClass;
4376 break;
4377 case 2:
4378 DstRegClass = &AMDGPU::VReg_64RegClass;
4379 break;
4380 default:
4381 llvm_unreachable("unhandled Reg sequence size");
4382 }
4383
4384 MachineIRBuilder B(*InsertPt);
4385 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4386 .addDef(MRI.createVirtualRegister(DstRegClass));
4387 for (unsigned i = 0; i < Elts.size(); ++i) {
4388 MIB.addReg(Elts[i]);
4389 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4390 }
4391 return MIB->getOperand(0).getReg();
4392}
4393
4394 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4395 SmallVectorImpl<Register> &Elts, Register &Src,
4396 MachineInstr *InsertPt,
4397 MachineRegisterInfo &MRI) {
4398 if (ModOpcode == TargetOpcode::G_FNEG) {
4399 Mods |= SISrcMods::NEG;
4400 // Check if all elements also have abs modifier
4401 SmallVector<Register, 8> NegAbsElts;
4402 for (auto El : Elts) {
4403 Register FabsSrc;
4404 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4405 break;
4406 NegAbsElts.push_back(FabsSrc);
4407 }
4408 if (Elts.size() != NegAbsElts.size()) {
4409 // Neg
4410 Src = buildRegSequence(Elts, InsertPt, MRI);
4411 } else {
4412 // Neg and Abs
4413 Mods |= SISrcMods::NEG_HI;
4414 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4415 }
4416 } else {
4417 assert(ModOpcode == TargetOpcode::G_FABS);
4418 // Abs
4419 Mods |= SISrcMods::NEG_HI;
4420 Src = buildRegSequence(Elts, InsertPt, MRI);
4421 }
4422}
4423
4424 InstructionSelector::ComplexRendererFns
4425AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4426 Register Src = Root.getReg();
4427 unsigned Mods = SISrcMods::OP_SEL_1;
4428 SmallVector<Register, 8> EltsF32;
4429
4430 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4431 assert(BV->getNumSources() > 0);
4432 // Based on first element decide which mod we match, neg or abs
4433 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4434 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4435 ? AMDGPU::G_FNEG
4436 : AMDGPU::G_FABS;
4437 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4438 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4439 if (ElF32->getOpcode() != ModOpcode)
4440 break;
4441 EltsF32.push_back(ElF32->getOperand(1).getReg());
4442 }
4443
4444 // All elements had ModOpcode modifier
4445 if (BV->getNumSources() == EltsF32.size()) {
4446 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4447 *MRI);
4448 }
4449 }
4450
4451 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4452 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4453}
4454
4455 InstructionSelector::ComplexRendererFns
4456AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4457 Register Src = Root.getReg();
4458 unsigned Mods = SISrcMods::OP_SEL_1;
4459 SmallVector<Register, 8> EltsV2F16;
4460
4461 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4462 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4463 Register FNegSrc;
4464 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4465 break;
4466 EltsV2F16.push_back(FNegSrc);
4467 }
4468
4469 // All elements had ModOpcode modifier
4470 if (CV->getNumSources() == EltsV2F16.size()) {
4471 Mods |= SISrcMods::NEG;
4472 Mods |= SISrcMods::NEG_HI;
4473 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4474 }
4475 }
4476
4477 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4478 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4479}
4480
4481 InstructionSelector::ComplexRendererFns
4482AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4483 Register Src = Root.getReg();
4484 unsigned Mods = SISrcMods::OP_SEL_1;
4485 SmallVector<Register, 8> EltsV2F16;
4486
4487 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4488 assert(CV->getNumSources() > 0);
4489 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4490 // Based on first element decide which mod we match, neg or abs
4491 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4492 ? AMDGPU::G_FNEG
4493 : AMDGPU::G_FABS;
4494
4495 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4496 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4497 if (ElV2F16->getOpcode() != ModOpcode)
4498 break;
4499 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4500 }
4501
4502 // All elements had ModOpcode modifier
4503 if (CV->getNumSources() == EltsV2F16.size()) {
4504 MachineIRBuilder B(*Root.getParent());
4505 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4506 *MRI);
4507 }
4508 }
4509
4510 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4511 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4512}
4513
4514 InstructionSelector::ComplexRendererFns
4515AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4516 std::optional<FPValueAndVReg> FPValReg;
4517 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4518 if (TII.isInlineConstant(FPValReg->Value)) {
4519 return {{[=](MachineInstrBuilder &MIB) {
4520 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4521 }}};
4522 }
4523 // Non-inlineable splat floats should not fall through to the integer
4524 // immediate checks.
4525 return {};
4526 }
4527
4528 APInt ICst;
4529 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4530 if (TII.isInlineConstant(ICst)) {
4531 return {
4532 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4533 }
4534 }
4535
4536 return {};
4537}
4538
4539 InstructionSelector::ComplexRendererFns
4540AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4541 Register Src =
4542 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4543 unsigned Key = 0;
4544
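// A right shift of the 32-bit index source by a whole number of bytes can be
// folded into index_key, which selects the byte of the packed index to use.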
4545 Register ShiftSrc;
4546 std::optional<ValueAndVReg> ShiftAmt;
4547 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4548 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4549 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4550 Key = ShiftAmt->Value.getZExtValue() / 8;
4551 Src = ShiftSrc;
4552 }
4553
4554 return {{
4555 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4556 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4557 }};
4558}
4559
4560 InstructionSelector::ComplexRendererFns
4561AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4562
4563 Register Src =
4564 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4565 unsigned Key = 0;
4566
4567 Register ShiftSrc;
4568 std::optional<ValueAndVReg> ShiftAmt;
4569 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4570 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4571 ShiftAmt->Value.getZExtValue() == 16) {
4572 Src = ShiftSrc;
4573 Key = 1;
4574 }
4575
4576 return {{
4577 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4578 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4579 }};
4580}
4581
4582 InstructionSelector::ComplexRendererFns
4583AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4584 Register Src;
4585 unsigned Mods;
4586 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4587
4588 // FIXME: Handle op_sel
4589 return {{
4590 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4591 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4592 }};
4593}
4594
4595 InstructionSelector::ComplexRendererFns
4596AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4597 Register Src;
4598 unsigned Mods;
4599 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4600 /*IsCanonicalizing=*/true,
4601 /*AllowAbs=*/false,
4602 /*OpSel=*/false);
4603
4604 return {{
4605 [=](MachineInstrBuilder &MIB) {
4606 MIB.addReg(
4607 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4608 },
4609 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4610 }};
4611}
4612
4613 InstructionSelector::ComplexRendererFns
4614AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4615 Register Src;
4616 unsigned Mods;
4617 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4618 /*IsCanonicalizing=*/true,
4619 /*AllowAbs=*/false,
4620 /*OpSel=*/true);
4621
4622 return {{
4623 [=](MachineInstrBuilder &MIB) {
4624 MIB.addReg(
4625 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4626 },
4627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4628 }};
4629}
4630
4631bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4632 Register &Base,
4633 Register *SOffset,
4634 int64_t *Offset) const {
4635 MachineInstr *MI = Root.getParent();
4636 MachineBasicBlock *MBB = MI->getParent();
4637
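// Which of SOffset/Offset the caller asks for determines the addressing form
// being matched: immediate only, SGPR offset only, or SGPR plus immediate.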
4638 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4639 // then we can select all ptr + 32-bit offsets.
4640 SmallVector<GEPInfo, 4> AddrInfo;
4641 getAddrModeInfo(*MI, *MRI, AddrInfo);
4642
4643 if (AddrInfo.empty())
4644 return false;
4645
4646 const GEPInfo &GEPI = AddrInfo[0];
4647 std::optional<int64_t> EncodedImm;
4648
4649 if (SOffset && Offset) {
4650 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4651 /*HasSOffset=*/true);
4652 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4653 AddrInfo.size() > 1) {
4654 const GEPInfo &GEPI2 = AddrInfo[1];
4655 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4656 if (Register OffsetReg =
4657 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4658 Base = GEPI2.SgprParts[0];
4659 *SOffset = OffsetReg;
4660 *Offset = *EncodedImm;
4661 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4662 return true;
4663
4664 // For unbuffered smem loads, it is illegal for the Immediate Offset
4665 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4666 // is negative. Handle the case where the Immediate Offset + SOffset
4667 // is negative.
4668 auto SKnown = KB->getKnownBits(*SOffset);
4669 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4670 return false;
4671
4672 return true;
4673 }
4674 }
4675 }
4676 return false;
4677 }
4678
4679 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4680 /*HasSOffset=*/false);
4681 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4682 Base = GEPI.SgprParts[0];
4683 *Offset = *EncodedImm;
4684 return true;
4685 }
4686
4687 // SGPR offset is unsigned.
4688 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4689 GEPI.Imm != 0) {
4690 // If we make it this far, we have a load with a 32-bit immediate offset.
4691 // It is OK to select this using a sgpr offset, because we have already
4692 // failed trying to select this load into one of the _IMM variants since
4693 // the _IMM Patterns are considered before the _SGPR patterns.
4694 Base = GEPI.SgprParts[0];
4695 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4696 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4697 .addImm(GEPI.Imm);
4698 return true;
4699 }
4700
4701 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4702 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4703 Base = GEPI.SgprParts[0];
4704 *SOffset = OffsetReg;
4705 return true;
4706 }
4707 }
4708
4709 return false;
4710}
4711
4712 InstructionSelector::ComplexRendererFns
4713AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4714 Register Base;
4715 int64_t Offset;
4716 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4717 return std::nullopt;
4718
4719 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4720 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4721}
4722
4723 InstructionSelector::ComplexRendererFns
4724AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4725 SmallVector<GEPInfo, 4> AddrInfo;
4726 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4727
4728 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4729 return std::nullopt;
4730
4731 const GEPInfo &GEPInfo = AddrInfo[0];
4732 Register PtrReg = GEPInfo.SgprParts[0];
4733 std::optional<int64_t> EncodedImm =
4734 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4735 if (!EncodedImm)
4736 return std::nullopt;
4737
4738 return {{
4739 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4740 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4741 }};
4742}
4743
4744 InstructionSelector::ComplexRendererFns
4745AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4746 Register Base, SOffset;
4747 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4748 return std::nullopt;
4749
4750 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4751 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4752}
4753
4754 InstructionSelector::ComplexRendererFns
4755AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4756 Register Base, SOffset;
4757 int64_t Offset;
4758 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4759 return std::nullopt;
4760
4761 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4762 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4763 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4764}
4765
4766std::pair<Register, int>
4767AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4768 uint64_t FlatVariant) const {
4769 MachineInstr *MI = Root.getParent();
4770
4771 auto Default = std::pair(Root.getReg(), 0);
4772
4773 if (!STI.hasFlatInstOffsets())
4774 return Default;
4775
4776 Register PtrBase;
4777 int64_t ConstOffset;
4778 std::tie(PtrBase, ConstOffset) =
4779 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4780
4781 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4782 !isFlatScratchBaseLegal(Root.getReg())))
4783 return Default;
4784
4785 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4786 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4787 return Default;
4788
4789 return std::pair(PtrBase, ConstOffset);
4790}
4791
4792 InstructionSelector::ComplexRendererFns
4793AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4794 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4795
4796 return {{
4797 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4798 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4799 }};
4800}
4801
4802 InstructionSelector::ComplexRendererFns
4803AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4804 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4805
4806 return {{
4807 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4808 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4809 }};
4810}
4811
4812 InstructionSelector::ComplexRendererFns
4813AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4814 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4815
4816 return {{
4817 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4818 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4819 }};
4820}
4821
4822 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4823 InstructionSelector::ComplexRendererFns
4824AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4825 Register Addr = Root.getReg();
4826 Register PtrBase;
4827 int64_t ConstOffset;
4828 int64_t ImmOffset = 0;
4829
4830 // Match the immediate offset first, which canonically is moved as low as
4831 // possible.
4832 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4833
4834 if (ConstOffset != 0) {
4835 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4836 SIInstrFlags::FlatGlobal)) {
4837 Addr = PtrBase;
4838 ImmOffset = ConstOffset;
4839 } else {
4840 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4841 if (isSGPR(PtrBaseDef->Reg)) {
4842 if (ConstOffset > 0) {
4843 // Offset is too large.
4844 //
4845 // saddr + large_offset -> saddr +
4846 // (voffset = large_offset & ~MaxOffset) +
4847 // (large_offset & MaxOffset);
4848 int64_t SplitImmOffset, RemainderOffset;
4849 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4850 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4851
4852 if (isUInt<32>(RemainderOffset)) {
4853 MachineInstr *MI = Root.getParent();
4854 MachineBasicBlock *MBB = MI->getParent();
4855 Register HighBits =
4856 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4857
4858 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4859 HighBits)
4860 .addImm(RemainderOffset);
4861
4862 return {{
4863 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4864 [=](MachineInstrBuilder &MIB) {
4865 MIB.addReg(HighBits);
4866 }, // voffset
4867 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4868 }};
4869 }
4870 }
4871
4872 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4873 // is 1, we would need to perform 1 or 2 extra moves for each half of
4874 // the constant, so it is better to do a scalar add and then issue a
4875 // single VALU instruction to materialize zero. Otherwise it takes fewer
4876 // instructions to perform VALU adds with immediates or inline literals.
4877 unsigned NumLiterals =
4878 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
4879 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
4880 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4881 return std::nullopt;
4882 }
4883 }
4884 }
4885
4886 // Match the variable offset.
4887 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4888 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4889 // Look through the SGPR->VGPR copy.
4890 Register SAddr =
4891 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4892
4893 if (isSGPR(SAddr)) {
4894 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4895
4896 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4897 // inserted later.
4898 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4899 return {{[=](MachineInstrBuilder &MIB) { // saddr
4900 MIB.addReg(SAddr);
4901 },
4902 [=](MachineInstrBuilder &MIB) { // voffset
4903 MIB.addReg(VOffset);
4904 },
4905 [=](MachineInstrBuilder &MIB) { // offset
4906 MIB.addImm(ImmOffset);
4907 }}};
4908 }
4909 }
4910 }
4911
4912 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4913 // drop this.
4914 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4915 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4916 return std::nullopt;
4917
4918 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4919 // moves required to copy a 64-bit SGPR to VGPR.
4920 MachineInstr *MI = Root.getParent();
4921 MachineBasicBlock *MBB = MI->getParent();
4922 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4923
4924 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4925 .addImm(0);
4926
4927 return {{
4928 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4929 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4930 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4931 }};
4932}
4933
4934 InstructionSelector::ComplexRendererFns
4935AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4936 Register Addr = Root.getReg();
4937 Register PtrBase;
4938 int64_t ConstOffset;
4939 int64_t ImmOffset = 0;
4940
4941 // Match the immediate offset first, which canonically is moved as low as
4942 // possible.
4943 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4944
4945 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4946 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4947 SIInstrFlags::FlatScratch)) {
4948 Addr = PtrBase;
4949 ImmOffset = ConstOffset;
4950 }
4951
4952 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4953 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4954 int FI = AddrDef->MI->getOperand(1).getIndex();
4955 return {{
4956 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4957 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4958 }};
4959 }
4960
4961 Register SAddr = AddrDef->Reg;
4962
4963 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4964 Register LHS = AddrDef->MI->getOperand(1).getReg();
4965 Register RHS = AddrDef->MI->getOperand(2).getReg();
4966 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4967 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4968
4969 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4970 isSGPR(RHSDef->Reg)) {
4971 int FI = LHSDef->MI->getOperand(1).getIndex();
4972 MachineInstr &I = *Root.getParent();
4973 MachineBasicBlock *BB = I.getParent();
4974 const DebugLoc &DL = I.getDebugLoc();
4975 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4976
4977 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4978 .addFrameIndex(FI)
4979 .addReg(RHSDef->Reg)
4980 .setOperandDead(3); // Dead scc
4981 }
4982 }
4983
4984 if (!isSGPR(SAddr))
4985 return std::nullopt;
4986
4987 return {{
4988 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4989 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4990 }};
4991}
4992
4993// Check whether the flat scratch SVS swizzle bug affects this access.
4994bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4995 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4996 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4997 return false;
4998
4999 // The bug affects the swizzling of SVS accesses if there is any carry out
5000 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5001 // voffset to (soffset + inst_offset).
5002 auto VKnown = KB->getKnownBits(VAddr);
5003 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
5004 KnownBits::makeConstant(APInt(32, ImmOffset)));
5005 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5006 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5007 return (VMax & 3) + (SMax & 3) >= 4;
5008}
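// Illustrative note (not part of the original source): a worked instance of
// the conservative check above. If VAddr's known maximum ends in binary 10
// (VMax & 3 == 2) and the (SAddr + inst_offset) maximum ends in binary 11
// (SMax & 3 == 3), then 2 + 3 == 5 >= 4, so a carry from bit 1 into bit 2
// cannot be ruled out and the SVS form is rejected. If either low 2-bit field
// is provably zero, the sum stays below 4 and the access is considered safe.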
5009
5011AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5012 Register Addr = Root.getReg();
5013 Register PtrBase;
5014 int64_t ConstOffset;
5015 int64_t ImmOffset = 0;
5016
5017 // Match the immediate offset first, which canonically is moved as low as
5018 // possible.
5019 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5020
5021 Register OrigAddr = Addr;
5022 if (ConstOffset != 0 &&
5023 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
5024 Addr = PtrBase;
5025 ImmOffset = ConstOffset;
5026 }
5027
5028 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5029 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5030 return std::nullopt;
5031
5032 Register RHS = AddrDef->MI->getOperand(2).getReg();
5033 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5034 return std::nullopt;
5035
5036 Register LHS = AddrDef->MI->getOperand(1).getReg();
5037 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5038
5039 if (OrigAddr != Addr) {
5040 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5041 return std::nullopt;
5042 } else {
5043 if (!isFlatScratchBaseLegalSV(OrigAddr))
5044 return std::nullopt;
5045 }
5046
5047 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5048 return std::nullopt;
5049
5050 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5051 int FI = LHSDef->MI->getOperand(1).getIndex();
5052 return {{
5053 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5054 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5055 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5056 }};
5057 }
5058
5059 if (!isSGPR(LHS))
5060 return std::nullopt;
5061
5062 return {{
5063 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5064 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5065 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5066 }};
5067}
5068
5070AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5071 MachineInstr *MI = Root.getParent();
5072 MachineBasicBlock *MBB = MI->getParent();
5075
5076 int64_t Offset = 0;
5077 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5079 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5080
5081 // TODO: Should this be inside the render function? The iterator seems to
5082 // move.
5083 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5084 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5085 HighBits)
5086 .addImm(Offset & ~MaxOffset);
5087
5088 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5089 MIB.addReg(Info->getScratchRSrcReg());
5090 },
5091 [=](MachineInstrBuilder &MIB) { // vaddr
5092 MIB.addReg(HighBits);
5093 },
5094 [=](MachineInstrBuilder &MIB) { // soffset
5095 // Use constant zero for soffset and rely on eliminateFrameIndex
5096 // to choose the appropriate frame register if need be.
5097 MIB.addImm(0);
5098 },
5099 [=](MachineInstrBuilder &MIB) { // offset
5100 MIB.addImm(Offset & MaxOffset);
5101 }}};
5102 }
5103
5104 assert(Offset == 0 || Offset == -1);
5105
5106 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5107 // offsets.
5108 std::optional<int> FI;
5109 Register VAddr = Root.getReg();
5110
5111 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5112 Register PtrBase;
5113 int64_t ConstOffset;
5114 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5115 if (ConstOffset != 0) {
5116 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5118 KB->signBitIsZero(PtrBase))) {
5119 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5120 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5121 FI = PtrBaseDef->getOperand(1).getIndex();
5122 else
5123 VAddr = PtrBase;
5124 Offset = ConstOffset;
5125 }
5126 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5127 FI = RootDef->getOperand(1).getIndex();
5128 }
5129
5130 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5131 MIB.addReg(Info->getScratchRSrcReg());
5132 },
5133 [=](MachineInstrBuilder &MIB) { // vaddr
5134 if (FI)
5135 MIB.addFrameIndex(*FI);
5136 else
5137 MIB.addReg(VAddr);
5138 },
5139 [=](MachineInstrBuilder &MIB) { // soffset
5140 // Use constant zero for soffset and rely on eliminateFrameIndex
5141 // to choose the appropriate frame register if need be.
5142 MIB.addImm(0);
5143 },
5144 [=](MachineInstrBuilder &MIB) { // offset
5145 MIB.addImm(Offset);
5146 }}};
5147}
5148
5149bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5150 int64_t Offset) const {
5151 if (!isUInt<16>(Offset))
5152 return false;
5153
5155 return true;
5156
 5157 // On Southern Islands, instructions with a negative base value and an
 5158 // offset don't seem to work.
5159 return KB->signBitIsZero(Base);
5160}
5161
5162bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5163 int64_t Offset1,
5164 unsigned Size) const {
5165 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5166 return false;
5167 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5168 return false;
5169
5171 return true;
5172
 5173 // On Southern Islands, instructions with a negative base value and an
 5174 // offset don't seem to work.
5175 return KB->signBitIsZero(Base);
5176}
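// Illustrative note (not part of the original source), assuming a read2/write2
// pair with Size == 4: byte offsets 8 and 12 encode as 8/4 == 2 and 12/4 == 3,
// which both fit the 8-bit offset fields, so the pair is legal. A byte offset
// of 1028 would encode as 257, which does not fit in 8 bits and is rejected,
// as is any offset that is not a multiple of Size.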
5177
5178// Return whether the operation has the NoUnsignedWrap property.
5180 return Addr->getOpcode() == TargetOpcode::G_OR ||
5181 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5182 Addr->getFlag(MachineInstr::NoUWrap));
5183}
5184
5185// Check that the base address of a flat scratch load/store in the form
5186// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
5187// hardware requirement). We always treat the first operand as the base address here.
5188bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5189 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5190
5191 if (isNoUnsignedWrap(AddrMI))
5192 return true;
5193
5194 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5195 // values.
5196 if (STI.hasSignedScratchOffsets())
5197 return true;
5198
5199 Register LHS = AddrMI->getOperand(1).getReg();
5200 Register RHS = AddrMI->getOperand(2).getReg();
5201
5202 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5203 std::optional<ValueAndVReg> RhsValReg =
 5205 // If the immediate offset is negative and within a certain range, the base
 5206 // address cannot also be negative. If the base were also negative, the sum
 5207 // would be either negative or much larger than the valid range of scratch
 5208 // memory a thread can access.
5209 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5210 RhsValReg->Value.getSExtValue() > -0x40000000)
5211 return true;
5212 }
5213
5214 return KB->signBitIsZero(LHS);
5215}
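// Illustrative note (not part of the original source): a rough sketch of the
// negative-offset reasoning above. With an immediate such as -16 (which lies
// in (-0x40000000, 0)), a non-negative base like 0x1000 gives a small positive
// address, whereas a negative base would make the sum either negative or wrap
// far beyond the scratch range a thread can access; hence the base can be
// treated as non-negative without querying known bits.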
5216
5217// Check that the address values in SGPR/VGPR are legal for flat scratch in
5218// the form: SGPR + VGPR.
5219bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5220 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5221
5222 if (isNoUnsignedWrap(AddrMI))
5223 return true;
5224
5225 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5226 // values.
5227 if (STI.hasSignedScratchOffsets())
5228 return true;
5229
5230 Register LHS = AddrMI->getOperand(1).getReg();
5231 Register RHS = AddrMI->getOperand(2).getReg();
5232 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5233}
5234
5235// Check that the address values in SGPR/VGPR are legal for flat scratch in
5236// the form: SGPR + VGPR + Imm.
5237bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5238 Register Addr) const {
5239 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5240 // values.
5241 if (STI.hasSignedScratchOffsets())
5242 return true;
5243
5244 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5245 Register Base = AddrMI->getOperand(1).getReg();
5246 std::optional<DefinitionAndSourceRegister> BaseDef =
5248 std::optional<ValueAndVReg> RHSOffset =
5250 assert(RHSOffset);
5251
 5252 // If the immediate offset is negative and within a certain range, the base
 5253 // address cannot also be negative. If the base were also negative, the sum
 5254 // would be either negative or much larger than the valid range of scratch
 5255 // memory a thread can access.
5256 if (isNoUnsignedWrap(BaseDef->MI) &&
5257 (isNoUnsignedWrap(AddrMI) ||
5258 (RHSOffset->Value.getSExtValue() < 0 &&
5259 RHSOffset->Value.getSExtValue() > -0x40000000)))
5260 return true;
5261
5262 Register LHS = BaseDef->MI->getOperand(1).getReg();
5263 Register RHS = BaseDef->MI->getOperand(2).getReg();
5264 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5265}
5266
5267bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5268 unsigned ShAmtBits) const {
5269 assert(MI.getOpcode() == TargetOpcode::G_AND);
5270
5271 std::optional<APInt> RHS =
5272 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5273 if (!RHS)
5274 return false;
5275
5276 if (RHS->countr_one() >= ShAmtBits)
5277 return true;
5278
5279 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
5280 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5281}
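// Illustrative note (not part of the original source): for a 32-bit shift the
// hardware only consumes the low 5 bits of the shift amount (ShAmtBits == 5),
// so a mask like (x & 31) has at least 5 trailing ones in the RHS and is
// redundant. A mask of (x & 15) only guarantees 4 trailing ones, so it is kept
// unless the known-zero bits of the LHS cover the remaining bit.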
5282
5284AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5285 MachineOperand &Root) const {
5286 Register Reg = Root.getReg();
5288
5289 std::optional<DefinitionAndSourceRegister> Def =
5290 getDefSrcRegIgnoringCopies(Reg, *MRI);
5291 assert(Def && "this shouldn't be an optional result");
5292 Reg = Def->Reg;
5293
5294 if (Register WaveBase = getWaveAddress(Def->MI)) {
5295 return {{
5296 [=](MachineInstrBuilder &MIB) { // rsrc
5297 MIB.addReg(Info->getScratchRSrcReg());
5298 },
5299 [=](MachineInstrBuilder &MIB) { // soffset
5300 MIB.addReg(WaveBase);
5301 },
5302 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5303 }};
5304 }
5305
5306 int64_t Offset = 0;
5307
5308 // FIXME: Copy check is a hack
5310 if (mi_match(Reg, *MRI,
5311 m_GPtrAdd(m_Reg(BasePtr),
5313 if (!TII.isLegalMUBUFImmOffset(Offset))
5314 return {};
5315 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5316 Register WaveBase = getWaveAddress(BasePtrDef);
5317 if (!WaveBase)
5318 return {};
5319
5320 return {{
5321 [=](MachineInstrBuilder &MIB) { // rsrc
5322 MIB.addReg(Info->getScratchRSrcReg());
5323 },
5324 [=](MachineInstrBuilder &MIB) { // soffset
5325 MIB.addReg(WaveBase);
5326 },
5327 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5328 }};
5329 }
5330
5331 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5333 return {};
5334
5335 return {{
5336 [=](MachineInstrBuilder &MIB) { // rsrc
5337 MIB.addReg(Info->getScratchRSrcReg());
5338 },
5339 [=](MachineInstrBuilder &MIB) { // soffset
5340 MIB.addImm(0);
5341 },
5342 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5343 }};
5344}
5345
5346std::pair<Register, unsigned>
5347AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5348 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5349 int64_t ConstAddr = 0;
5350
5351 Register PtrBase;
5352 int64_t Offset;
5353 std::tie(PtrBase, Offset) =
5354 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5355
5356 if (Offset) {
5357 if (isDSOffsetLegal(PtrBase, Offset)) {
5358 // (add n0, c0)
5359 return std::pair(PtrBase, Offset);
5360 }
5361 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5362 // TODO
5363
5364
5365 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5366 // TODO
5367
5368 }
5369
5370 return std::pair(Root.getReg(), 0);
5371}
5372
5374AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5375 Register Reg;
5376 unsigned Offset;
5377 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5378 return {{
5379 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5380 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5381 }};
5382}
5383
5385AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5386 return selectDSReadWrite2(Root, 4);
5387}
5388
5390AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5391 return selectDSReadWrite2(Root, 8);
5392}
5393
5395AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5396 unsigned Size) const {
5397 Register Reg;
5398 unsigned Offset;
5399 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5400 return {{
5401 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5402 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5403 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5404 }};
5405}
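// Illustrative note (not part of the original source): the two rendered
// immediates are element indices, not byte offsets. For a 4-byte read2 with a
// byte offset of 8, selectDSReadWrite2Impl returns 8/4 == 2, so offset0 == 2
// and offset1 == 3 address the adjacent elements at bytes 8 and 12.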
5406
5407std::pair<Register, unsigned>
5408AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5409 unsigned Size) const {
5410 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5411 int64_t ConstAddr = 0;
5412
5413 Register PtrBase;
5414 int64_t Offset;
5415 std::tie(PtrBase, Offset) =
5416 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5417
5418 if (Offset) {
5419 int64_t OffsetValue0 = Offset;
5420 int64_t OffsetValue1 = Offset + Size;
5421 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5422 // (add n0, c0)
5423 return std::pair(PtrBase, OffsetValue0 / Size);
5424 }
5425 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5426 // TODO
5427
5428 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5429 // TODO
5430
5431 }
5432
5433 return std::pair(Root.getReg(), 0);
5434}
5435
5436/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5437/// the base value with the constant offset. There may be intervening copies
5438/// between \p Root and the identified constant. Returns \p Root, 0 if this does
5439/// not match the pattern.
5440std::pair<Register, int64_t>
5441AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5442 Register Root, const MachineRegisterInfo &MRI) const {
5443 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5444 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5445 return {Root, 0};
5446
5447 MachineOperand &RHS = RootI->getOperand(2);
5448 std::optional<ValueAndVReg> MaybeOffset =
5450 if (!MaybeOffset)
5451 return {Root, 0};
5452 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5453}
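// Illustrative note (not part of the original source): given generic MIR of
// the shape
//   %c:_(s64) = G_CONSTANT i64 16
//   %addr:_(p1) = G_PTR_ADD %base, %c
// this helper returns {%base, 16}, looking through copies on the way to the
// constant; any other shape of %addr yields {%addr, 0}.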
5454
5456 MIB.addImm(0);
5457}
5458
5459/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5460/// BasePtr is not valid, a null base pointer will be used.
5462 uint32_t FormatLo, uint32_t FormatHi,
5463 Register BasePtr) {
5464 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5465 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5466 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5467 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5468
5469 B.buildInstr(AMDGPU::S_MOV_B32)
5470 .addDef(RSrc2)
5471 .addImm(FormatLo);
5472 B.buildInstr(AMDGPU::S_MOV_B32)
5473 .addDef(RSrc3)
5474 .addImm(FormatHi);
5475
 5476 // Build the register half holding the constants before building the
 5477 // full 128-bit register. If we are building multiple resource descriptors,
 5478 // this will allow CSEing of the 2-component register.
5479 B.buildInstr(AMDGPU::REG_SEQUENCE)
5480 .addDef(RSrcHi)
5481 .addReg(RSrc2)
5482 .addImm(AMDGPU::sub0)
5483 .addReg(RSrc3)
5484 .addImm(AMDGPU::sub1);
5485
5486 Register RSrcLo = BasePtr;
5487 if (!BasePtr) {
5488 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5489 B.buildInstr(AMDGPU::S_MOV_B64)
5490 .addDef(RSrcLo)
5491 .addImm(0);
5492 }
5493
5494 B.buildInstr(AMDGPU::REG_SEQUENCE)
5495 .addDef(RSrc)
5496 .addReg(RSrcLo)
5497 .addImm(AMDGPU::sub0_sub1)
5498 .addReg(RSrcHi)
5499 .addImm(AMDGPU::sub2_sub3);
5500
5501 return RSrc;
5502}
5503
5505 const SIInstrInfo &TII, Register BasePtr) {
5506 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5507
5508 // FIXME: Why are half the "default" bits ignored based on the addressing
5509 // mode?
5510 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5511}
5512
5514 const SIInstrInfo &TII, Register BasePtr) {
5515 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5516
5517 // FIXME: Why are half the "default" bits ignored based on the addressing
5518 // mode?
5519 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5520}
5521
5522AMDGPUInstructionSelector::MUBUFAddressData
5523AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5524 MUBUFAddressData Data;
5525 Data.N0 = Src;
5526
5527 Register PtrBase;
5528 int64_t Offset;
5529
5530 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5531 if (isUInt<32>(Offset)) {
5532 Data.N0 = PtrBase;
5533 Data.Offset = Offset;
5534 }
5535
5536 if (MachineInstr *InputAdd
5537 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5538 Data.N2 = InputAdd->getOperand(1).getReg();
5539 Data.N3 = InputAdd->getOperand(2).getReg();
5540
 5541 // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
 5542 // FIXME: Don't know that this was defined by operand 0.
5543 //
5544 // TODO: Remove this when we have copy folding optimizations after
5545 // RegBankSelect.
5546 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5547 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5548 }
5549
5550 return Data;
5551}
5552
5553/// Return whether the addr64 MUBUF mode should be used for the given address.
5554bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5555 // (ptr_add N2, N3) -> addr64, or
5556 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5557 if (Addr.N2)
5558 return true;
5559
5560 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5561 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5562}
5563
5564/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5565/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5566/// component.
5567void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5568 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5569 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5570 return;
5571
5572 // Illegal offset, store it in soffset.
5573 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5574 B.buildInstr(AMDGPU::S_MOV_B32)
5575 .addDef(SOffset)
5576 .addImm(ImmOffset);
5577 ImmOffset = 0;
5578}
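// Illustrative note (not part of the original source), assuming a target whose
// MUBUF immediate field is 12 bits (maximum 4095): an ImmOffset of 5000 does
// not fit, so 5000 is moved into a fresh SGPR with S_MOV_B32 and ImmOffset is
// reset to 0; the buffer instruction then adds the value through soffset
// instead of the immediate field.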
5579
5580bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5581 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5582 Register &SOffset, int64_t &Offset) const {
5583 // FIXME: Predicates should stop this from reaching here.
5584 // addr64 bit was removed for volcanic islands.
5585 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5586 return false;
5587
5588 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5589 if (!shouldUseAddr64(AddrData))
5590 return false;
5591
5592 Register N0 = AddrData.N0;
5593 Register N2 = AddrData.N2;
5594 Register N3 = AddrData.N3;
5595 Offset = AddrData.Offset;
5596
5597 // Base pointer for the SRD.
5598 Register SRDPtr;
5599
5600 if (N2) {
5601 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5602 assert(N3);
5603 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5604 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5605 // addr64, and construct the default resource from a 0 address.
5606 VAddr = N0;
5607 } else {
5608 SRDPtr = N3;
5609 VAddr = N2;
5610 }
5611 } else {
5612 // N2 is not divergent.
5613 SRDPtr = N2;
5614 VAddr = N3;
5615 }
5616 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5617 // Use the default null pointer in the resource
5618 VAddr = N0;
5619 } else {
5620 // N0 -> offset, or
5621 // (N0 + C1) -> offset
5622 SRDPtr = N0;
5623 }
5624
5625 MachineIRBuilder B(*Root.getParent());
5626 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5627 splitIllegalMUBUFOffset(B, SOffset, Offset);
5628 return true;
5629}
5630
5631bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5632 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5633 int64_t &Offset) const {
5634
5635 // FIXME: Pattern should not reach here.
5636 if (STI.useFlatForGlobal())
5637 return false;
5638
5639 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5640 if (shouldUseAddr64(AddrData))
5641 return false;
5642
5643 // N0 -> offset, or
5644 // (N0 + C1) -> offset
5645 Register SRDPtr = AddrData.N0;
5646 Offset = AddrData.Offset;
5647
5648 // TODO: Look through extensions for 32-bit soffset.
5649 MachineIRBuilder B(*Root.getParent());
5650
5651 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5652 splitIllegalMUBUFOffset(B, SOffset, Offset);
5653 return true;
5654}
5655
5657AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5658 Register VAddr;
5659 Register RSrcReg;
5660 Register SOffset;
5661 int64_t Offset = 0;
5662
5663 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5664 return {};
5665
5666 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5667 // pattern.
5668 return {{
5669 [=](MachineInstrBuilder &MIB) { // rsrc
5670 MIB.addReg(RSrcReg);
5671 },
5672 [=](MachineInstrBuilder &MIB) { // vaddr
5673 MIB.addReg(VAddr);
5674 },
5675 [=](MachineInstrBuilder &MIB) { // soffset
5676 if (SOffset)
5677 MIB.addReg(SOffset);
5678 else if (STI.hasRestrictedSOffset())
5679 MIB.addReg(AMDGPU::SGPR_NULL);
5680 else
5681 MIB.addImm(0);
5682 },
5683 [=](MachineInstrBuilder &MIB) { // offset
5684 MIB.addImm(Offset);
5685 },
5686 addZeroImm, // cpol
5687 addZeroImm, // tfe
5688 addZeroImm // swz
5689 }};
5690}
5691
5693AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5694 Register RSrcReg;
5695 Register SOffset;
5696 int64_t Offset = 0;
5697
5698 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5699 return {};
5700
5701 return {{
5702 [=](MachineInstrBuilder &MIB) { // rsrc
5703 MIB.addReg(RSrcReg);
5704 },
5705 [=](MachineInstrBuilder &MIB) { // soffset
5706 if (SOffset)
5707 MIB.addReg(SOffset);
5708 else if (STI.hasRestrictedSOffset())
5709 MIB.addReg(AMDGPU::SGPR_NULL);
5710 else
5711 MIB.addImm(0);
5712 },
5713 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5714 addZeroImm, // cpol
5715 addZeroImm, // tfe
5716 addZeroImm, // swz
5717 }};
5718}
5719
5721AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5722
5723 Register SOffset = Root.getReg();
5724
5725 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5726 SOffset = AMDGPU::SGPR_NULL;
5727
5728 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5729}
5730
5731/// Get an immediate that must fit in 32 bits and is treated as zero-extended.
5732static std::optional<uint64_t>
5734 // getIConstantVRegVal sexts any values, so see if that matters.
5735 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5736 if (!OffsetVal || !isInt<32>(*OffsetVal))
5737 return std::nullopt;
5738 return Lo_32(*OffsetVal);
5739}
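// Illustrative note (not part of the original source): getIConstantVRegSExtVal
// sign-extends, so a 32-bit constant with the bit pattern 0xFFFFFFF0 comes back
// as -16. isInt<32>(-16) holds and Lo_32(-16) restores the unsigned pattern
// 0xFFFFFFF0, while a value that needs more than 32 bits is rejected with
// std::nullopt.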
5740
5742AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5743 std::optional<uint64_t> OffsetVal =
5744 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
5745 if (!OffsetVal)
5746 return {};
5747
5748 std::optional<int64_t> EncodedImm =
5749 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5750 if (!EncodedImm)
5751 return {};
5752
5753 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5754}
5755
5757AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5759
5760 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5761 if (!OffsetVal)
5762 return {};
5763
5764 std::optional<int64_t> EncodedImm =
5766 if (!EncodedImm)
5767 return {};
5768
5769 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5770}
5771
5773AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5774 // Match the (soffset + offset) pair as a 32-bit register base and
5775 // an immediate offset.
5776 Register SOffset;
5777 unsigned Offset;
5778 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5779 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5780 if (!SOffset)
5781 return std::nullopt;
5782
5783 std::optional<int64_t> EncodedOffset =
5784 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5785 if (!EncodedOffset)
5786 return std::nullopt;
5787
5788 assert(MRI->getType(SOffset) == LLT::scalar(32));
5789 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5790 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5791}
5792
5793std::pair<Register, unsigned>
5794AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5795 bool &Matched) const {
5796 Matched = false;
5797
5798 Register Src;
5799 unsigned Mods;
5800 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5801
5802 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
5803 assert(MRI->getType(Src) == LLT::scalar(16));
5804
 5805 // Only change Src if a source modifier could be gained. In such cases the
 5806 // new Src could be an SGPR, but this does not violate the constant bus
 5807 // restriction for the instruction that is being selected.
5808 Src = stripBitCast(Src, *MRI);
5809
5810 const auto CheckAbsNeg = [&]() {
5811 // Be careful about folding modifiers if we already have an abs. fneg is
5812 // applied last, so we don't want to apply an earlier fneg.
5813 if ((Mods & SISrcMods::ABS) == 0) {
5814 unsigned ModsTmp;
5815 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
5816
5817 if ((ModsTmp & SISrcMods::NEG) != 0)
5818 Mods ^= SISrcMods::NEG;
5819
5820 if ((ModsTmp & SISrcMods::ABS) != 0)
5821 Mods |= SISrcMods::ABS;
5822 }
5823 };
5824
5825 CheckAbsNeg();
5826
5827 // op_sel/op_sel_hi decide the source type and source.
 5828 // If the source's op_sel_hi is set, it indicates a conversion from fp16.
 5829 // If the source's op_sel is set, it picks the high half of the
 5830 // source register.
5831
5832 Mods |= SISrcMods::OP_SEL_1;
5833
5834 if (isExtractHiElt(*MRI, Src, Src)) {
5835 Mods |= SISrcMods::OP_SEL_0;
5836 CheckAbsNeg();
5837 }
5838
5839 Matched = true;
5840 }
5841
5842 return {Src, Mods};
5843}
5844
5846AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5847 MachineOperand &Root) const {
5848 Register Src;
5849 unsigned Mods;
5850 bool Matched;
5851 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5852 if (!Matched)
5853 return {};
5854
5855 return {{
5856 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5857 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5858 }};
5859}
5860
5862AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5863 Register Src;
5864 unsigned Mods;
5865 bool Matched;
5866 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5867
5868 return {{
5869 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5871 }};
5872}
5873
5874bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5875 MachineInstr &I, Intrinsic::ID IntrID) const {
5876 MachineBasicBlock *MBB = I.getParent();
5877 const DebugLoc &DL = I.getDebugLoc();
5878 Register CCReg = I.getOperand(0).getReg();
5879
5880 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5881 .addImm(I.getOperand(2).getImm());
5882
5883 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5884
5885 I.eraseFromParent();
5886 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5887 *MRI);
5888}
5889
5890bool AMDGPUInstructionSelector::selectSGetBarrierState(
5891 MachineInstr &I, Intrinsic::ID IntrID) const {
5892 MachineBasicBlock *MBB = I.getParent();
5893 const DebugLoc &DL = I.getDebugLoc();
5894 MachineOperand BarOp = I.getOperand(2);
5895 std::optional<int64_t> BarValImm =
5896 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5897
5898 if (!BarValImm) {
5899 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5900 .addReg(BarOp.getReg());
5901 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5902 }
5904 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5905 : AMDGPU::S_GET_BARRIER_STATE_M0;
5906 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5907
5908 auto DstReg = I.getOperand(0).getReg();
5909 const TargetRegisterClass *DstRC =
5910 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5911 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5912 return false;
5913 MIB.addDef(DstReg);
5914 if (BarValImm) {
5915 MIB.addImm(*BarValImm);
5916 }
5917 I.eraseFromParent();
5918 return true;
5919}
5920
5921unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5922 if (HasInlineConst) {
5923 switch (IntrID) {
5924 default:
5925 llvm_unreachable("not a named barrier op");
5926 case Intrinsic::amdgcn_s_barrier_join:
5927 return AMDGPU::S_BARRIER_JOIN_IMM;
5928 case Intrinsic::amdgcn_s_get_named_barrier_state:
5929 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5930 };
5931 } else {
5932 switch (IntrID) {
5933 default:
5934 llvm_unreachable("not a named barrier op");
5935 case Intrinsic::amdgcn_s_barrier_join:
5936 return AMDGPU::S_BARRIER_JOIN_M0;
5937 case Intrinsic::amdgcn_s_get_named_barrier_state:
5938 return AMDGPU::S_GET_BARRIER_STATE_M0;
5939 };
5940 }
5941}
5942
5943bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5944 MachineInstr &I, Intrinsic::ID IntrID) const {
5945 MachineBasicBlock *MBB = I.getParent();
5946 const DebugLoc &DL = I.getDebugLoc();
5947 MachineOperand BarOp = I.getOperand(1);
5948 MachineOperand CntOp = I.getOperand(2);
5949
5950 // BarID = (BarOp >> 4) & 0x3F
5951 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5952 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5953 .add(BarOp)
5954 .addImm(4u)
5955 .setOperandDead(3); // Dead scc
5956
5957 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5958 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5959 .addReg(TmpReg0)
5960 .addImm(0x3F)
5961 .setOperandDead(3); // Dead scc
5962
 5963 // MO = ((CntOp & 0x3F) << ShAmt) | BarID
5964 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5965 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5966 .add(CntOp)
5967 .addImm(0x3F)
5968 .setOperandDead(3); // Dead scc
5969
5970 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5971 constexpr unsigned ShAmt = 16;
5972 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5973 .addReg(TmpReg2)
5974 .addImm(ShAmt)
5975 .setOperandDead(3); // Dead scc
5976
5977 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5978 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5979 .addReg(TmpReg1)
5980 .addReg(TmpReg3)
5981 .setOperandDead(3); // Dead scc;
5982
5983 auto CopyMIB =
5984 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5985 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5986
5987 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5988 ? AMDGPU::S_BARRIER_INIT_M0
5989 : AMDGPU::S_BARRIER_SIGNAL_M0;
5991 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5992
5993 I.eraseFromParent();
5994 return true;
5995}
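// Illustrative note (not part of the original source): the packing above with
// example operands BarOp == 0x230 and CntOp == 5. BarID is
// (0x230 >> 4) & 0x3F == 0x23, the member count field is (5 & 0x3F) << 16 ==
// 0x50000, and M0 is set to 0x50000 | 0x23 == 0x50023 before the
// S_BARRIER_INIT_M0 / S_BARRIER_SIGNAL_M0 instruction is emitted.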
5996
5997bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5998 MachineInstr &I, Intrinsic::ID IntrID) const {
5999 MachineBasicBlock *MBB = I.getParent();
6000 const DebugLoc &DL = I.getDebugLoc();
6001 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6002 ? I.getOperand(2)
6003 : I.getOperand(1);
6004 std::optional<int64_t> BarValImm =
6005 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6006
6007 if (!BarValImm) {
6008 // BarID = (BarOp >> 4) & 0x3F
6009 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6010 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6011 .addReg(BarOp.getReg())
6012 .addImm(4u)
6013 .setOperandDead(3); // Dead scc;
6014
6015 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6016 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6017 .addReg(TmpReg0)
6018 .addImm(0x3F)
6019 .setOperandDead(3); // Dead scc;
6020
6021 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6022 .addReg(TmpReg1);
6023 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6024 }
6025
6027 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6028 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6029
6030 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6031 auto DstReg = I.getOperand(0).getReg();
6032 const TargetRegisterClass *DstRC =
6033 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6034 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6035 return false;
6036 MIB.addDef(DstReg);
6037 }
6038
6039 if (BarValImm) {
6040 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6041 MIB.addImm(BarId);
6042 }
6043
6044 I.eraseFromParent();
6045 return true;
6046}
6047
6048void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6049 const MachineInstr &MI,
6050 int OpIdx) const {
6051 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6052 "Expected G_CONSTANT");
6053 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6054}
6055
6056void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6057 const MachineInstr &MI,
6058 int OpIdx) const {
6059 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6060 "Expected G_CONSTANT");
6061 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6062}
6063
6064void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6065 const MachineInstr &MI,
6066 int OpIdx) const {
6067 const MachineOperand &Op = MI.getOperand(1);
6068 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6069 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6070}
6071
6072void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6073 const MachineInstr &MI,
6074 int OpIdx) const {
6075 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6076 "Expected G_CONSTANT");
6077 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6078}
6079
6080/// This only really exists to satisfy the DAG type-checking machinery, so it
6081/// is a no-op here.
6082void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6083 const MachineInstr &MI,
6084 int OpIdx) const {
6085 const MachineOperand &Op = MI.getOperand(OpIdx);
6086 int64_t Imm;
6087 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6088 MIB.addImm(Imm);
6089 else
6090 MIB.addImm(Op.getImm());
6091}
6092
6093void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6094 const MachineInstr &MI,
6095 int OpIdx) const {
6096 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6097}
6098
6099void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6100 const MachineInstr &MI,
6101 int OpIdx) const {
6102 assert(OpIdx >= 0 && "expected to match an immediate operand");
6103 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6104}
6105
6106void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6107 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6108 assert(OpIdx >= 0 && "expected to match an immediate operand");
6109 MIB.addImm(
6110 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6111}
6112
6113void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6114 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6115 assert(OpIdx >= 0 && "expected to match an immediate operand");
6116 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6118 : (int64_t)SISrcMods::DST_OP_SEL);
6119}
6120
6121void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6122 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6123 assert(OpIdx >= 0 && "expected to match an immediate operand");
6124 MIB.addImm(
6125 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6126}
6127
6128void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6129 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6130 assert(OpIdx >= 0 && "expected to match an immediate operand");
6131 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6132 ? (int64_t)(SISrcMods::OP_SEL_0)
6133 : 0);
6134}
6135
6136void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6137 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6138 assert(OpIdx >= 0 && "expected to match an immediate operand");
6139 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6140 : 0);
6141}
6142
6143void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6144 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6145 assert(OpIdx >= 0 && "expected to match an immediate operand");
6146 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6147 : 0);
6148}
6149
6150void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6151 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6152 assert(OpIdx >= 0 && "expected to match an immediate operand");
6153 MIB.addImm(
6154 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6155}
6156
6157void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6158 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6159 assert(OpIdx >= 0 && "expected to match an immediate operand");
6160 MIB.addImm(
6161 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6162}
6163
6164void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6165 const MachineInstr &MI,
6166 int OpIdx) const {
6167 assert(OpIdx >= 0 && "expected to match an immediate operand");
6168 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6171}
6172
6173void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6174 const MachineInstr &MI,
6175 int OpIdx) const {
6176 assert(OpIdx >= 0 && "expected to match an immediate operand");
6177 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6180 MIB.addImm(Swizzle);
6181}
6182
6183void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6184 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6185 assert(OpIdx >= 0 && "expected to match an immediate operand");
6186 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6189 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6190}
6191
6192void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6193 const MachineInstr &MI,
6194 int OpIdx) const {
6195 MIB.addFrameIndex(MI.getOperand(1).getIndex());
6196}
6197
6198void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6199 const MachineInstr &MI,
6200 int OpIdx) const {
6201 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6202 int ExpVal = APF.getExactLog2Abs();
6203 assert(ExpVal != INT_MIN);
6204 MIB.addImm(ExpVal);
6205}
6206
6207void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6208 const MachineInstr &MI,
6209 int OpIdx) const {
6210 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6211 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6212 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
 6213 // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6214 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6215}
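// Illustrative note (not part of the original source): (Imm + 3) % 4 realizes
// the remapping in the table above, which can be checked exhaustively:
//   static_assert((0 + 3) % 4 == 3 && (1 + 3) % 4 == 0 &&
//                 (2 + 3) % 4 == 1 && (3 + 3) % 4 == 2,
//                 "rounding-mode remap");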
6216
6217/// Convert from a 2-bit value to the enum values used for op_sel* source modifiers.
6218void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6219 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6220 unsigned Val = MI.getOperand(OpIdx).getImm();
6221 unsigned New = 0;
6222 if (Val & 0x1)
6224 if (Val & 0x2)
6226 MIB.addImm(New);
6227}
6228
6229bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
6230 return TII.isInlineConstant(Imm);
6231}
6232
6233bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
6234 return TII.isInlineConstant(Imm);
6235}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelKnownBits &KnownBits)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
static const LLT S1
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Value * RHS
Value * LHS
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelKnownBits *KB, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getWavefrontSizeLog2() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Represents a G_BUILD_VECTOR.
bool useVGPRIndexMode() const
bool hasPermlane32Swap() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
Definition: GCNSubtarget.h:350
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:482
bool hasBitOp3Insts() const
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:563
bool hasSignedScratchOffsets() const
bool hasRestrictedSOffset() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:716
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool hasG16() const
bool hasPermlane16Swap() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasSplitBarriers() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:730
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasSALUFloatInsts() const
bool hasPartialNSAEncoding() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Represents a G_CONCAT_VECTORS.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelKnownBits *kb, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
APInt getKnownOnes(Register R)
KnownBits getKnownBits(Register R)
bool signBitIsZero(Register Op)
APInt getKnownZeroes(Register R)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isValid() const
Definition: LowLevelType.h:145
constexpr bool isVector() const
Definition: LowLevelType.h:148
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:277
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:270
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Metadata node.
Definition: Metadata.h:1073
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
void setReturnAddressIsTaken(bool s)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
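A hedged sketch of how the builder methods above chain together. The opcode is only an example, and I, DL, TII, and DstReg are assumed to come from the surrounding selection routine:

// Emit "S_MOV_B32 DstReg, 42" in front of the instruction being selected.
// Implicit operands declared by the opcode are appended automatically.
MachineBasicBlock *MBB = I.getParent();
BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_MOV_B32))
    .addDef(DstReg)
    .addImm(42);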
Representation of each machine instruction.
Definition: MachineInstr.h:71
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:349
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:580
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:501
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
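A small sketch exercising the operand accessors above; NewReg is an assumed virtual register supplied by the caller, and operand index 1 is only an example:

// Rewrite operand 1 if it is an explicit register use; otherwise read it as
// an immediate.
MachineOperand &Op = MI.getOperand(1);
if (Op.isReg() && !Op.isImplicit()) {
  Op.setReg(NewReg);
} else if (Op.isImm()) {
  int64_t Val = Op.getImm();
  (void)Val;
}
// Operands can also be created from scratch and added to an instruction:
MachineOperand Imm = MachineOperand::CreateImm(0);
MachineOperand Use = MachineOperand::CreateReg(NewReg, /*isDef=*/false);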
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Analysis providing profile information.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
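A sketch of the post-selection constraint step this helper supports; SReg_32RegClass is just an example class, and DstReg and MRI are assumed from the selector context:

// Pin the generic virtual register to a concrete SGPR class; the helper
// returns null (false in boolean context) if the constraint is impossible.
if (!RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI))
  return false;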
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
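A hedged sketch of the split-then-verify pattern the FLAT offset helpers above support. The address space and flat variant are example choices, and TII and ImmOffset are assumed from the caller:

// Keep only the part of ImmOffset that the global FLAT encoding can carry;
// the remainder would have to be added to the address register separately.
auto [EncodedOffset, RemainderOffset] =
    TII.splitFlatOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS,
                        SIInstrFlags::FlatGlobal);
assert(TII.isLegalFLATOffset(EncodedOffset, AMDGPUAS::GLOBAL_ADDRESS,
                             SIInstrFlags::FlatGlobal));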
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCRegister getReturnAddressReg(const MachineFunction &MF) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
const TargetRegisterClass * getBoolRC() const
MCRegister getExec() const
const TargetRegisterClass * getWaveMaskRegClass() const
static bool isSGPRClass(const TargetRegisterClass *RC)
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Definition: Value.h:74
Value(Type *Ty, unsigned scid)
Definition: Value.cpp:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Key
PAL metadata keys.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
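A short sketch of how the base/offset decomposition above is typically consumed; Root is an assumed root MachineOperand and KB the known-bits analysis, both taken from the surrounding selector context:

// Strip a constant addend off the address.  BaseReg addresses the same
// memory as Root once ConstOffset is re-applied, so ConstOffset can be
// moved into an immediate field whenever the encoding allows it.
auto [BaseReg, ConstOffset] =
    AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB);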
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
operand_type_match m_Reg()
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
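A minimal sketch of the mi_match combinators above; Reg and MRI are assumed from the surrounding selection routine, and the register-binding overload of m_Reg (taking a Register reference) is used:

// Recognise "Reg = G_AND X, <constant>" in either operand order and capture
// the non-constant input and the constant mask.
Register X;
APInt MaskVal;
if (mi_match(Reg, *MRI, m_GAnd(m_Reg(X), m_ICst(MaskVal)))) {
  // X and MaskVal are now bound; e.g. check MaskVal.isMask() here.
}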
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:910
@ Offset
Definition: DWP.cpp:480
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:56
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:645
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:155
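A hedged sketch of the usual tail of a select*() routine, combining BuildMI with the constraint helper above; the opcode and the names DstReg, Src0, Src1, DL, I, TII, TRI, and RBI are placeholders for what the caller already has:

// Emit the concrete instruction, drop the generic one, and constrain every
// register operand to the classes required by the target description.
MachineInstr *And =
    BuildMI(*I.getParent(), I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addUse(Src0)
        .addUse(Src1);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*And, TII, TRI, RBI);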
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:486
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:314
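A short sketch of the constant-extraction helper above; OffsetReg and MIB are assumed to come from the caller:

// If OffsetReg is produced by a G_CONSTANT, fold it as an immediate;
// otherwise keep it as a register operand.
if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(OffsetReg, *MRI)) {
  MIB.addImm(*Imm);
} else {
  MIB.addUse(OffsetReg);
}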
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition: Utils.cpp:439
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:161
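A self-contained sketch of Hi_32/Lo_32, the kind of split performed when a 64-bit immediate has to be materialized as two 32-bit halves:

#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <utility>

// Split a 64-bit immediate into {low half, high half}, ready to be moved
// into the two 32-bit sub-registers of a 64-bit destination.
static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  return {llvm::Lo_32(Imm), llvm::Hi_32(Imm)};
}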
unsigned getUndefRegState(bool B)
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ Add
Sum of integers.
DWARFExpression::Operation Op
@ DS_Error
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition: Utils.cpp:467
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:493
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
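A small sketch built only from the two KnownBits helpers above; the alignment-style check is an illustrative use, not code from this file:

#include "llvm/Support/KnownBits.h"

// Add a known constant Offset to the known bits of a base value and report
// whether the low NumLowBits bits remain provably zero.
static bool keepsLowBitsZero(const llvm::KnownBits &Base, uint64_t Offset,
                             unsigned NumLowBits) {
  using namespace llvm;
  KnownBits Sum = KnownBits::add(
      Base, KnownBits::makeConstant(APInt(Base.getBitWidth(), Offset)));
  return Sum.countMinTrailingZeros() >= NumLowBits;
}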
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.