//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note on the cost model numbers used below: they correspond to a
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem
/// as that was the first CPU to support that feature level and thus most
/// likely has the worst-case cost, although we may discard an outlying worst
/// cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                     divss    sqrtss    rsqrtss
///   AMD K7            11-16    19        3
///   Piledriver        9-24     13-15     5
///   Jaguar            14       16        2
///   Pentium II,III    18       30        2
///   Nehalem           7-14     7-18      3
///   Haswell           10-13    11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//
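
// An illustration of how to read the cost tables below: the Skylake-derived
// entry { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } } says a v4f32 fdiv costs 3
// under TCK_RecipThroughput, 11 cycles under TCK_Latency, 1 instruction under
// TCK_CodeSize, and 1 micro-op under TCK_SizeAndLatency; each lookup site
// then scales the value by the type-legalization factor (LT.first).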

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};

using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostKindTblEntry =
    TypeConversionCostTblEntryT<CostKindCosts>;

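// Usage sketch (illustrative; this is the lookup pattern repeated throughout
// this file):
//   static const CostKindTblEntry Table[] = {
//     { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld
//   };
//   if (const auto *Entry = CostTableLookup(Table, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind]) // nullopt for unset (~0U)
//       return LT.first * *KindCost;
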
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

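// For illustration: an i32 scalar or a <1 x i64> vector qualifies for a
// CFCMOV conditional load/store, while i8 (too narrow) and <4 x i32>
// (more than one element) do not.
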
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

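  // For illustration: a v8i8 mul is costed as zext v8i8 -> v8i16, a v8i16
  // mul, then trunc v8i16 -> v8i8, matching how such narrow multiplies are
  // actually lowered.
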
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

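    // For illustration: a v4i32 mul whose operands both provably fit in i15
    // (e.g. both zero-extended from v4i16) is re-costed as a v8i16 multiply,
    // pricing it like the PMADDWD the backend would emit.
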
    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

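  // For illustration: a v2i64 mul with both operands zero-extended from i32
  // is re-costed as X86ISD::PMULUDQ, i.e. the single pmuludq the backend
  // emits instead of the generic 3*pmuludq/3*shift/2*add expansion.
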
  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

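  // For illustration: mul X, 8 is costed as shl X, 3, and mul X, -8 as
  // shl X, 3 plus a negating sub, per the simplification described above.
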
  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }

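  // Worked example: sdiv X, 4 expands to
  //   T = ashr X, 31; T = lshr T, 30; X = add X, T; R = ashr X, 2
  // (bias negative dividends by 3 before the final arithmetic shift), which
  // is exactly the 2*AShr + LShr + Add costed above.
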
  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

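  // For illustration: udiv X, 8 reduces to lshr X, 3 and urem X, 8 to
  // and X, 7, so each is costed as the single shift or mask.
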
  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8,  { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8,  { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 12, 8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 4, 8, 7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 5, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 6, 6, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 7, 8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8,  { 7, 9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8,  { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,  { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,  { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,  { 4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,  { 4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,  { 6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,  { 6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,  { 7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8,  { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8,  { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8,  { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::OR,  MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64,  { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64,    { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64,  { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,    { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,    { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

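  // For illustration: on AVX2, shl <16 x i16> %x, <1, 2, 3, ...> is costed
  // (and lowered) as mul %x, <2, 4, 8, ...>, a single vpmullw, since a left
  // shift by constant C is a multiply by 2^C.
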
  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
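
  // Note: XOP's variable shifts (e.g. vpshlb/vpshab) shift left for positive
  // amounts and right for negative amounts, so a right shift by a constant
  // folds the negation into the shift-amount vector, as modelled above.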

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

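  // For illustration: shl <4 x i32> %x, <1, 2, 3, 4> becomes
  // mul %x, <2, 4, 8, 16>; pre-AVX2 targets have no per-lane variable
  // shifts but can use a vector multiply instead.
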
  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD,  MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB,  MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8,  { 6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16,  { 5, 18, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8,  { 6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8,  { 8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16,  { 5, 11, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8,  { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16,  { 5, 11, 5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64,  { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64,  { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8,  { 5, 18, 6, 12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8,  { 4, 8, 8, 16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32,  { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32,  { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  { 6, 10, 8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64,  { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,   { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,   { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,   { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,   { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,   { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,   { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,   { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,   { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8,  { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8,  { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32,  { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32,  { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  { 12, 15, 19, 20 } },

    { ISD::AND, MVT::v32i8,  { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32,  { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64,  { 1, 1, 1, 2 } }, // vandps

    { ISD::OR,  MVT::v32i8,  { 1, 1, 1, 2 } }, // vorps
    { ISD::OR,  MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR,  MVT::v8i32,  { 1, 1, 1, 2 } }, // vorps
    { ISD::OR,  MVT::v4i64,  { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8,  { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32,  { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64,  { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8,  { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8,  { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32,  { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32,  { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64,  { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64,  { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64,  { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8,  { 10, 21, 11, 17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  { 22, 22, 27, 40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,  { 6, 9, 11, 11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16, 24, 25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,  { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,  { 9, 11, 12, 17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64,  { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64,  { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8,  { 11, 27, 12, 18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  { 23, 23, 30, 43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,  { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,  { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64,  { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64,  { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,  { 21, 22, 24, 36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 44, 45, 51, 76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,  { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,  { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64,  { 5, 6, 10, 14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64,  { 12, 12, 22, 30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,   { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32,   { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1289 if (auto KindCost = Entry->Cost[CostKind])
1290 return LT.first * *KindCost;
1291
1292 static const CostKindTblEntry SSE42CostTable[] = {
1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297
1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302
1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370
1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1375
1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1380
1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1385
1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1388
1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393
1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1395
1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400
1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405
1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413
1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 };
1417
1418 if (ST->hasSSE2())
1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420 if (auto KindCost = Entry->Cost[CostKind])
1421 return LT.first * *KindCost;
1422
1423 static const CostKindTblEntry SSE1CostTable[] = {
1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426
1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432
1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480 // It is not a good idea to vectorize division. We have to scalarize it and
1481 // in the process we will often end up having to spill regular
1482 // registers. The overhead of division is going to dominate most kernels
1483 // anyway, so try hard to prevent vectorization of division - it is
1484 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485 // to hide "20 cycles" for each lane.
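// Worked example (illustrative): a <4 x i32> sdiv that legalizes in one
// step (LT.first == 1) is costed as 20 * 1 * 4 * ScalarCost(sdiv i32),
// which is high enough that the vectorizers will keep the division scalar.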
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489 InstructionCost ScalarCost =
1490 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491 Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506 return LT.first;
1507 break;
1508 }
1509 }
1510
1511 // Fallback to the default implementation.
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514}
1515
1516 InstructionCost
1517 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519 TTI::TargetCostKind CostKind) const {
1520 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521 return TTI::TCC_Basic;
1522 return InstructionCost::getInvalid();
1523}
1524
1525 InstructionCost X86TTIImpl::getShuffleCost(
1526 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535 // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
1539
1540 // Recognize a basic concat_vector shuffle.
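// E.g. (IR sketch) shufflevector <4 x float> %a, <4 x float> %b,
// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// simply concatenates %a and %b, so it is costed below as inserting %b
// into the upper half of the double-width result.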
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544 return getShuffleCost(TTI::SK_InsertSubvector,
1545 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546 CostKind, Mask.size() / 2, BaseTp);
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553 // For Broadcasts we are splatting the first element from the first input
1554 // register, so we only need to reference that input; all the output
1555 // registers are the same.
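// E.g. broadcasting element 0 of a <16 x float> on AVX legalizes to two
// v8f32 registers, but both halves hold the same splat of the one input,
// so charging a single register's broadcast cost is sufficient.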
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
1565
1566 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1567 // permutation.
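// E.g. (illustrative) the v8f32 mask <1,0,3,2,5,4,7,6> only swaps
// neighbours within each 128-bit lane, so it can be lowered with per-lane
// shuffles (vpermilps etc.) rather than a cross-lane vperm2f128.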
1568 bool IsInLaneShuffle = false;
1569 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1570 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1571 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1572 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1573 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1574 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1575 if ((Mask.size() % NumLanes) == 0)
1576 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1577 return P.value() == PoisonMaskElem ||
1578 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1579 (P.index() / NumEltsPerLane);
1580 });
1581 }
1582
1583 // Treat <X x bfloat> shuffles as <X x half>.
1584 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1585 LT.second = LT.second.changeVectorElementType(MVT::f16);
1586
1587 // Subvector extractions are free if they start at the beginning of a
1588 // vector and cheap if the subvectors are aligned.
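// E.g. on AVX, extracting a <4 x float> from a <8 x float> at index 0 is
// free (the low xmm aliases the ymm), while index 4 is a single cheap
// vextractf128.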
1589 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1590 int NumElts = LT.second.getVectorNumElements();
1591 if ((Index % NumElts) == 0)
1592 return TTI::TCC_Free;
1593 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1594 if (SubLT.second.isVector()) {
1595 int NumSubElts = SubLT.second.getVectorNumElements();
1596 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1597 return SubLT.first;
1598 // Handle some cases for widening legalization. For now we only handle
1599 // cases where the original subvector was naturally aligned and fits
1600 // evenly in its legalized subvector type.
1601 // FIXME: Remove some of the alignment restrictions.
1602 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1603 // vectors.
1604 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1605 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1606 (NumSubElts % OrigSubElts) == 0 &&
1607 LT.second.getVectorElementType() ==
1608 SubLT.second.getVectorElementType() &&
1609 LT.second.getVectorElementType().getSizeInBits() ==
1610 SubLT.second.getVectorElementType().getSizeInBits()) {
1611 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1612 "Unexpected number of elements!");
1613 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1614 LT.second.getVectorNumElements());
1615 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1616 SubLT.second.getVectorNumElements());
1617 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1618 InstructionCost ExtractCost = getShuffleCost(
1619 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1620
1621 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1622 // if we have SSSE3 we can use pshufb.
1623 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1624 return ExtractCost + 1; // pshufd or pshufb
1625
1626 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1627 "Unexpected vector size");
1628
1629 return ExtractCost + 2; // worst case pshufhw + pshufd
1630 }
1631 }
1632 // If the extract subvector is not optimal, treat it as single op shuffle.
1633 Kind = TTI::SK_PermuteSingleSrc;
1634 }
1635
1636 // Subvector insertions are cheap if the subvectors are aligned.
1637 // Note that in general, the insertion starting at the beginning of a vector
1638 // isn't free, because we need to preserve the rest of the wide vector,
1639 // but if the destination vector legalizes to the same width as the subvector
1640 // then the insertion will simplify to a (free) register copy.
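// E.g. on AVX, inserting a <2 x double> into a <4 x double> at index 2 is
// a single vinsertf128, while an insertion whose destination legalizes to
// the same register width as the subvector collapses to a free copy.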
1641 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1642 int NumElts = LT.second.getVectorNumElements();
1643 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1644 if (SubLT.second.isVector()) {
1645 int NumSubElts = SubLT.second.getVectorNumElements();
1646 bool MatchingTypes =
1647 NumElts == NumSubElts &&
1648 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1649 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1650 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1651 }
1652
1653 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1654 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1655 // v1f32 (legalised to f32) into a v4f32.
1656 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1657 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1658 return 1;
1659
1660 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1661 Kind = TTI::SK_PermuteTwoSrc;
1662 }
1663
1664 // Handle some common (illegal) sub-vector types as they are often very cheap
1665 // to shuffle even on targets without PSHUFB.
1666 EVT VT = TLI->getValueType(DL, BaseTp);
1667 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1668 !ST->hasSSSE3()) {
1669 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1670 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1671 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1672 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1673 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1674 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1675
1676 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1677 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1678 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1679 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1680
1681 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1682 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1683 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1684 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1685
1686 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1687 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1688 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1689 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1690 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1691
1692 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1693 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1694 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1695 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1696 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1697 };
1698
1699 if (ST->hasSSE2())
1700 if (const auto *Entry =
1701 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1702 return Entry->Cost;
1703 }
1704
1705 // We are going to permute multiple sources and the result will be in multiple
1706 // destinations. We provide an accurate cost only for splits where the element
1707 // type remains the same.
1708 if (LT.first != 1) {
1709 MVT LegalVT = LT.second;
1710 if (LegalVT.isVector() &&
1711 LegalVT.getVectorElementType().getSizeInBits() ==
1712 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1713 LegalVT.getVectorNumElements() <
1714 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1715 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1716 unsigned LegalVTSize = LegalVT.getStoreSize();
1717 // Number of source vectors after legalization:
1718 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1719 // Number of destination vectors after legalization:
1720 InstructionCost NumOfDests = LT.first;
1721
1722 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1723 LegalVT.getVectorNumElements());
1724
1725 if (!Mask.empty() && NumOfDests.isValid()) {
1726 // Try to produce a better estimate of the permutation cost:
1727 // 1. Split the source/destination vectors into real registers.
1728 // 2. Analyze the mask to identify which real registers are permuted.
1729 // If more than one source register is used to build a destination
1730 // register, the cost for that destination register is
1731 // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1732 // source register is used, build the sub-mask and calculate the cost as
1733 // a PermuteSingleSrc.
1734 // Also, for a single-register permute we try to identify whether the
1735 // destination register is just a copy of the source register or a copy
1736 // of the previous destination register (the cost is TTI::TCC_Basic).
1737 // If the source register is simply reused, the cost for this operation
1738 // is TTI::TCC_Free.
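// Illustrative example: a v16i32 single-source permute on AVX2 legalizes
// to two v8i32 registers. A destination register whose sub-mask reads
// both source registers is charged (2 - 1) * Cost_PermuteTwoSrc, one that
// reads a single source register is charged as a PermuteSingleSrc, and an
// identity sub-mask is a plain copy (TTI::TCC_Basic or free, as above).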
1739 NumOfDests =
1740 getTypeLegalizationCost(
1741 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1742 .first;
1743 unsigned E = *NumOfDests.getValue();
1744 unsigned NormalizedVF =
1745 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1746 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1747 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1748 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1749 copy(Mask, NormalizedMask.begin());
1750 unsigned PrevSrcReg = 0;
1751 ArrayRef<int> PrevRegMask;
1752 InstructionCost Cost = 0;
1753 processShuffleMasks(
1754 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1755 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1756 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1757 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1758 // Check if the previous register can be just copied to the next
1759 // one.
1760 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1761 PrevRegMask != RegMask)
1762 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1763 RegMask, CostKind, 0, nullptr);
1764 else
1765 // Just a copy of previous destination register.
1766 Cost += TTI::TCC_Basic;
1767 return;
1768 }
1769 if (SrcReg != DestReg &&
1770 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1771 // Just a copy of the source register.
1772 Cost += TTI::TCC_Basic;
1773 }
1774 PrevSrcReg = SrcReg;
1775 PrevRegMask = RegMask;
1776 },
1777 [this, SingleOpTy, CostKind,
1778 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1779 unsigned /*Unused*/, bool /*Unused*/) {
1780 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1781 CostKind, 0, nullptr);
1782 });
1783 return Cost;
1784 }
1785
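// Otherwise, pessimistically assume that every destination register has
// to merge all of the legalized sources: e.g. 3 sources feeding 2
// destinations is estimated as (3 - 1) * 2 = 4 two-source shuffles.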
1786 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1787 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1788 {}, CostKind, 0, nullptr);
1789 }
1790
1791 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1792 }
1793
1794 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1795 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1796 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1797
1798 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1799 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1800
1801 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1802 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1803 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1804 };
1805
1806 if (ST->hasVBMI())
1807 if (const auto *Entry =
1808 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1809 return LT.first * Entry->Cost;
1810
1811 static const CostTblEntry AVX512BWShuffleTbl[] = {
1812 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1813 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1814 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1815
1816 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1817 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1818 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1819 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1820
1821 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1822 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1823 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1824 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1825 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1826
1827 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1828 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1829 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1830 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1831 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1832
1833 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1834 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1835
1836 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1837 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1838 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1839 };
1840
1841 if (ST->hasBWI())
1842 if (const auto *Entry =
1843 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1844 return LT.first * Entry->Cost;
1845
1846 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1847 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1848 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1849 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1850 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1851 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1852 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1853 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1854
1855 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1856 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1857 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1858 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1859 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1860 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1861 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1862
1863 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1864 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1865 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1866 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1867 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1868 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1869 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1870 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1871 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1872 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1873 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1874
1875 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1876 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1877 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1878 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1879 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1880 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1881 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1882 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1883 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1884 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1885 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1886 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1887 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1888
1889 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1890 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1891 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1892 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1893 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1894 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1895 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1896 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1897 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1898 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1899 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1900 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1901
1902 // FIXME: This just applies the type legalization cost rules above
1903 // assuming these completely split.
1904 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1905 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1906 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1907 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1908 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1909 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1910
1911 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1912 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1913 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1914 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1915 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1916 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1917 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1918 };
1919
1920 if (ST->hasAVX512())
1921 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1922 if (auto KindCost = Entry->Cost[CostKind])
1923 return LT.first * *KindCost;
1924
1925 static const CostTblEntry AVX2InLaneShuffleTbl[] = {
1926 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpshufb
1927 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 1}, // vpshufb
1928 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpshufb
1929
1930 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
1931 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
1932 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpshufd + vpblendd
1933 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpshufd + vpblendd
1934 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // 2*vpshufb + vpor
1935 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // 2*vpshufb + vpor
1936 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // 2*vpshufb + vpor
1937 };
1938
1939 if (IsInLaneShuffle && ST->hasAVX2())
1940 if (const auto *Entry =
1941 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1942 return LT.first * Entry->Cost;
1943
1944 static const CostTblEntry AVX2ShuffleTbl[] = {
1945 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1946 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1947 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1948 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1949 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1950 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1951 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1952
1953 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1954 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1955 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1956 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1957 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1958 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1959 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1960
1961 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1962 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1963 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1964
1965 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1966 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1967 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1968 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1969 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1970
1971 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1972 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1973 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1974 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1975 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1976 // + vpblendvb
1977 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1978 // + vpblendvb
1979 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1980 // + vpblendvb
1981
1982 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1983 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1984 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1985 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1986 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1987 // + vpblendvb
1988 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1989 // + vpblendvb
1990 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1991 // + vpblendvb
1992 };
1993
1994 if (ST->hasAVX2())
1995 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1996 return LT.first * Entry->Cost;
1997
1998 static const CostTblEntry XOPShuffleTbl[] = {
1999 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
2000 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
2001 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
2002 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
2003 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
2004 // + vinsertf128
2005 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
2006 // + vinsertf128
2007
2008 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
2009 // + vinsertf128
2010 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
2011 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
2012 // + vinsertf128
2013 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
2014 };
2015
2016 if (ST->hasXOP())
2017 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2018 return LT.first * Entry->Cost;
2019
2020 static const CostTblEntry AVX1InLaneShuffleTbl[] = {
2021 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermilpd
2022 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermilpd
2023 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermilps
2024 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermilps
2025
2026 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2027 // + vpor + vinsertf128
2028 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2029 // + vpor + vinsertf128
2030 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2031 // + vpor + vinsertf128
2032
2033 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
2034 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
2035 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpermilpd + vblendpd
2036 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpermilps + vblendps
2037 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 4*pshufb
2038 // + 2*vpor + vinsertf128
2039 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 9}, // 2*vextractf128 + 4*pshufb
2040 // + 2*vpor + vinsertf128
2041 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 4*pshufb
2042 // + 2*vpor + vinsertf128
2043 };
2044
2045 if (IsInLaneShuffle && ST->hasAVX())
2046 if (const auto *Entry =
2047 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2048 return LT.first * Entry->Cost;
2049
2050 static const CostTblEntry AVX1ShuffleTbl[] = {
2051 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2052 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2053 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2054 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2055 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
2056 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
2057 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
2058
2059 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2060 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2061 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2062 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2063 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2064 // + vinsertf128
2065 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2066 // + vinsertf128
2067 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2068 // + vinsertf128
2069
2070 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
2071 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
2072 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2073 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2074 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2075 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2076 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2077
2078 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2079 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2080 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2081 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2082 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2083 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2084 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2085
2086 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2087 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2088 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2089 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2090 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2091 // + 2*por + vinsertf128
2092 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2093 // + 2*por + vinsertf128
2094 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2095 // + 2*por + vinsertf128
2096
2097 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2098 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2099 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2100 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2101 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2102 // + 4*por + vinsertf128
2103 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2104 // + 4*por + vinsertf128
2105 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2106 // + 4*por + vinsertf128
2107 };
2108
2109 if (ST->hasAVX())
2110 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2111 return LT.first * Entry->Cost;
2112
2113 static const CostTblEntry SSE41ShuffleTbl[] = {
2114 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2115 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2116 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2117 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2118 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2119 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2120 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2121 };
2122
2123 if (ST->hasSSE41())
2124 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2125 return LT.first * Entry->Cost;
2126
2127 static const CostTblEntry SSSE3ShuffleTbl[] = {
2128 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2129 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2130 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2131
2132 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2133 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2134 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2135
2136 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2137 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2138 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2139
2140 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2141 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2142 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2143 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2144 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2145
2146 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2147 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2148 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2149
2150 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2151 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2152 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2153 };
2154
2155 if (ST->hasSSSE3())
2156 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2157 return LT.first * Entry->Cost;
2158
2159 static const CostTblEntry SSE2ShuffleTbl[] = {
2160 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2161 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2162 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2163 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2164 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2165 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2166
2167 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2168 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2169 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2170 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2171 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2172 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2173 // + 2*pshufd + 2*unpck + packus
2174
2175 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2176 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2177 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2178 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2179 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2180 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2181
2182 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2183 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2184 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2185 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2186 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2187 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2188
2189 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2190 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2191 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2192 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2193 // + pshufd/unpck
2194 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2195 // + pshufd/unpck
2196 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw
2197 // + 2*pshufd + 2*unpck + 2*packus
2198 
2199 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd
2200 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd
2201 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2202 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute
2203 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 8}, // blend+permute
2204 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute
2205 };
2206
2207 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2208 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2209 };
2210
2211 if (ST->hasSSE2()) {
2212 bool IsLoad =
2213 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2214 if (ST->hasSSE3() && IsLoad)
2215 if (const auto *Entry =
2216 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2217 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2218 LT.second.getVectorElementCount()) &&
2219 "Table entry missing from isLegalBroadcastLoad()");
2220 return LT.first * Entry->Cost;
2221 }
2222
2223 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2224 return LT.first * Entry->Cost;
2225 }
2226
2227 static const CostTblEntry SSE1ShuffleTbl[] = {
2228 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2229 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2230 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2231 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2232 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2233 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2234 };
2235
2236 if (ST->hasSSE1()) {
2237 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2238 // SHUFPS: both pairs must come from the same source register.
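// E.g. mask <0,1,4,5> takes both low elements from one source and both
// high elements from the other, so a single shufps suffices; mask
// <0,4,1,5> mixes sources within a pair and falls through to the table.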
2239 auto MatchSHUFPS = [](int X, int Y) {
2240 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2241 };
2242 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2243 return 1;
2244 }
2245 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2246 return LT.first * Entry->Cost;
2247 }
2248
2249 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2250}
2251
2252 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2253 Type *Src,
2254 TTI::CastContextHint CCH,
2255 TTI::TargetCostKind CostKind,
2256 const Instruction *I) {
2257 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2258 assert(ISD && "Invalid opcode");
2259
2260 // The cost tables include both specific, custom (non-legal) src/dst type
2261 // conversions and generic, legalized types. We check for the custom
2262 // conversions first, before falling back to legalization.
2263 // FIXME: Need a better design of the cost table to handle non-simple types
2264 // and the potentially massive number of combinations (elem_num x src_type x dst_type).
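// E.g. a v32i16 -> v32i8 truncate on AVX512BW is matched directly by a
// custom entry below, whereas an odd-sized conversion such as
// v3i32 -> v3i16 is first legalized (roughly, widened to v4i32 -> v4i16)
// and then looked up with its legalized types.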
2265 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2266 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2267 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2268
2269 // Mask sign extend has an instruction.
2270 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2271 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2272 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2273 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2274 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2275 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2276 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2277 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2278 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2279 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2280 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2281 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2282 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2283 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2284 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2285 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2286 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2287
2288 // Mask zero extend is a sext + shift.
2289 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2290 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2291 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2292 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2293 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2294 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2295 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2296 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2297 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2298 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2299 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2300 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2301 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2302 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2303 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2304 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2305 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2306
2307 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2308 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2309 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2310 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2311 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2312 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2313 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2314 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2315 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2316 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2317 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2318 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2319 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2320 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2321 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2322 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2323 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2324
2325 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2326 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2327 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2328 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2329 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2330 };
2331
2332 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2333 // Mask sign extend has an instruction.
2334 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2342
2343 // Mask zero extend is a sext + shift.
2344 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2345 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2346 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2347 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2348 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2349 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2350 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2351 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2352
2353 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2354 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2355 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2356 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2357 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2358 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2359 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2360 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2361
2362 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2363 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2364
2365 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2366 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2367
2368 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2369 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2370
2371 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2372 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2373 };
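// Illustrative mapping (not annotated in the table above): the single-uop
// v8i64 -> v8f64 SINT_TO_FP entry corresponds to the AVX512DQ vcvtqq2pd
// instruction, and the FP_TO_SINT entries to vcvttps2qq/vcvttpd2qq.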
2374
2375 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2376 // 256-bit wide vectors.
2377
2378 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2379 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2380 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2381 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2382 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2383 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2384 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2385 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2386
2387 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2388 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2389 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2390 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2391 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2392 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2393 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2394 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2395 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2396 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2397 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2398 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2399 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2400 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2401 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2402 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2403 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2404 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2405 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2406 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2407 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2408 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2409 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2410 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2411 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2412 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2413 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2414 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2415 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2416 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2417 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2418 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2419 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2420 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2421
2422 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2423 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2424 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2425
2426 // Sign extend is zmm vpternlogd+vptruncdb.
2427 // Zero extend is zmm broadcast load+vptruncdb.
2428 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2429 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2430 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2431 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2432 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2433 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2434 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2435 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2436
2437 // Sign extend is zmm vpternlogd+vptruncdw.
2438 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2439 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2440 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2441 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2442 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2443 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2444 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2445 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2446 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2447
2448 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2449 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2450 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2451 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2452 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2453 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2454 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2455 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2456 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2457 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2458
2459 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2460 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2461 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2462 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2463
2464 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2465 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2466 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2467 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2468 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2469 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2470 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2471 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2472 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2473 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2474
2475 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2476 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2477
2478 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2479 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2480 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2481 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2482 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2483 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2484 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2485 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2486
2487 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2488 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2489 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2490 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2491 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2492 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2493 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2494 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2495 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2496 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2497
2498 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2499 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2500 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2501 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2502 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2503 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2504 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2505 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2506 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2507 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2508 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2509
2510 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2511 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2512 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2513 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2514 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2515 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2516 };
2517
2518 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2519 // Mask sign extend has an instruction.
2520 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2522 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2524 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2526 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2528 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2529 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2530 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2531 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2532 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2533 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2535 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2536 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2537
2538 // Mask zero extend is a sext + shift.
2539 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2540 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2541 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2542 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2543 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2544 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2545 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2546 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2547 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2548 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2549 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2550 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2551 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2552 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2553 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2554 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2555 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2556
2557 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2558 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2559 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2560 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2561 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2562 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2563 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2564 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2565 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2566 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2567 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2568 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2569 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2570 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2571 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2572 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2573 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2574
2575 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2576 };
2577
2578 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2579 // Mask sign extend has an instruction.
2580 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2588
2589 // Mask zero extend is a sext + shift.
2590 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2591 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2592 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2594 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2595 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2596 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2597 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2598
2599 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2600 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2601 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2602 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2603 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2604 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2605 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2606 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2607
2608 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2609 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2610 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2611 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2612
2613 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2614 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2615 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2616 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2617
2618 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2619 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2620 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2621 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2622
2623 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2624 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2625 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2626 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2627 };
2628
2629 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2630 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2631 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2632 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2633 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2634 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2635 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2636 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2637 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2638 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2639 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2640 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2641 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2642 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2643 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2644 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2645 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2646 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2647 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2648
2649 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2650 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2651 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2653 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2655 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2657 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2659
2660 // sign extend is vpcmpeq+maskedmove+vpmovdw
2661 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2662 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2664 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2665 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2666 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2667 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2668 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2669 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2670
2671 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2672 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2673 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2674 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2675 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2676 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2677 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2678 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2679
2680 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2681 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2682 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2683 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2684
2685 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2686 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2687 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2688 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2689 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2690 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2691 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2692 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2693 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2694 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2695 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2696 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2697
2698 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2699 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2700 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2701 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2702
2703 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2704 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2705 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2706 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2707 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2708 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2709 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2710 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2711 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2712 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2713 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2714 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2715 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2716
2717 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2718 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2719 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2720
2721 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2722 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2723 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2724 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2725 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2726 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2727 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2728 };
2729
2730 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2731 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2732 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2733 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2734 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2735 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2736 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2737
2738 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2739 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2740 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2743 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2744 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2745 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2746 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2747 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2748 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2749 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2750 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2752
2753 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2754
2755 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2756 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2757 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2758 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2759 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2760 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2761 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2762 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2763 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2764 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2765 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2766 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2767
2768 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2769 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2770
2771 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2772 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2773 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2774 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2775
2776 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2777 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2778 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2779 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2781 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2782 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2783 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2784
2785 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2786 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2787 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2788 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2789 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2790 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2791 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2792
2793 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2794 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2795 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2796 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2797 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2798 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2799 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2800 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2801 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2802 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2803 };
2804
2805 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2806 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2807 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2808 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2809 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2810 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2811 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2812
2813 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2814 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2815 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2816 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2817 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2818 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2819 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2820 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2821 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2822 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2823 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2824 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2825
2826 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2827 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2828 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2829 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2830 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2831
2832 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2833 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2834 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2835 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2836 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2837 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2838 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2839 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2840
2841 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2847 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2848 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2849 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2850 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2851 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2852 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2853
2854 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2864 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2865 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2866 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2867 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2868 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2869 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2870 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2871
2872 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2873 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2874 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2875 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2876 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2877 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2878 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2879 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2880 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2881 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2882 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2883
2884 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2885 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2886 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2887 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2888 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2889 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2890 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2891 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2892 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2893 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2894 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2895 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2896 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2897
2898 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2899 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2900 };
2901
2902 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2903 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2904 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2905 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2906 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2907 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2908 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2909 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2910 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2911 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2912 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2913 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2914 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2915
2916 // These truncates end up widening elements.
2917 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2918 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2919 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2920
2921 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2922 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2923 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2924
2925 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2926 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2927 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2928 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2929 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2930 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2931 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2932 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2933 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2934 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2935 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2936
2937 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2938 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2939 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2940 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2941 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2942 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2943 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2944 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2945 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2946 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2947 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2948 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2949 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2950 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2951
2952 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2953 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2954 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2955 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2956 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2957 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2958 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2959 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2960 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2961 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2962
2963 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2964 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2965 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2966 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2967 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2968 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2969 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2970 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2971 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2972 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2973 };
2974
2975 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2976 // These are somewhat magic numbers justified by comparing the
2977 // output of llvm-mca for our various supported scheduler models
2978 // and basing it off the worst case scenario.
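// (One way to reproduce such numbers, as an illustration only: feed the
// candidate instruction sequence to `llvm-mca -mtriple=x86_64-- -mcpu=<model>`
// for each supported scheduler model and take the worst reported value.)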
2979 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2980 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2984 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2985 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2986 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2987 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2989 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2990 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2991
2992 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2998 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2999 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3004 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3005
3006 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3007 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3009 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3010 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3011 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3012 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3013 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3014 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3015 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3016
3017 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3018 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3021 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3022 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3023 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3024 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3025 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3026 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3027
3028 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3029 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3030 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3031 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3032 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3033 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3034 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3035 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3036 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3037 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3038 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3039 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3040
3041 // These truncates are really widening elements.
3042 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3043 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3044 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3045 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3046 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3047 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3048
3049 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3050 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3051 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3052 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3053 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3054 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3055 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3056 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3057 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3058 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3059 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3060 };
3061
3062 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3063 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3064 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3065 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3066 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3067 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3068 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3069 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3070 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3071 };
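// e.g. with F16C a scalar f32 -> f16 FP_ROUND is a single vcvtps2ph and the
// reverse FP_EXTEND a single vcvtph2ps, matching the unit costs above.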
3072
3073 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3074 EVT SrcTy = TLI->getValueType(DL, Src);
3075 EVT DstTy = TLI->getValueType(DL, Dst);
3076
3077 // The function getSimpleVT only handles simple value types.
3078 if (SrcTy.isSimple() && DstTy.isSimple()) {
3079 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3080 MVT SimpleDstTy = DstTy.getSimpleVT();
3081
3082 if (ST->useAVX512Regs()) {
3083 if (ST->hasBWI())
3084 if (const auto *Entry = ConvertCostTableLookup(
3085 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3086 if (auto KindCost = Entry->Cost[CostKind])
3087 return *KindCost;
3088
3089 if (ST->hasDQI())
3090 if (const auto *Entry = ConvertCostTableLookup(
3091 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3092 if (auto KindCost = Entry->Cost[CostKind])
3093 return *KindCost;
3094
3095 if (ST->hasAVX512())
3096 if (const auto *Entry = ConvertCostTableLookup(
3097 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3098 if (auto KindCost = Entry->Cost[CostKind])
3099 return *KindCost;
3100 }
3101
3102 if (ST->hasBWI())
3103 if (const auto *Entry = ConvertCostTableLookup(
3104 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3105 if (auto KindCost = Entry->Cost[CostKind])
3106 return *KindCost;
3107
3108 if (ST->hasDQI())
3109 if (const auto *Entry = ConvertCostTableLookup(
3110 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3111 if (auto KindCost = Entry->Cost[CostKind])
3112 return *KindCost;
3113
3114 if (ST->hasAVX512())
3115 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3116 SimpleDstTy, SimpleSrcTy))
3117 if (auto KindCost = Entry->Cost[CostKind])
3118 return *KindCost;
3119
3120 if (ST->hasAVX2()) {
3121 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3122 SimpleDstTy, SimpleSrcTy))
3123 if (auto KindCost = Entry->Cost[CostKind])
3124 return *KindCost;
3125 }
3126
3127 if (ST->hasAVX()) {
3128 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3129 SimpleDstTy, SimpleSrcTy))
3130 if (auto KindCost = Entry->Cost[CostKind])
3131 return *KindCost;
3132 }
3133
3134 if (ST->hasF16C()) {
3135 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3136 SimpleDstTy, SimpleSrcTy))
3137 if (auto KindCost = Entry->Cost[CostKind])
3138 return *KindCost;
3139 }
3140
3141 if (ST->hasSSE41()) {
3142 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3143 SimpleDstTy, SimpleSrcTy))
3144 if (auto KindCost = Entry->Cost[CostKind])
3145 return *KindCost;
3146 }
3147
3148 if (ST->hasSSE2()) {
3149 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3150 SimpleDstTy, SimpleSrcTy))
3151 if (auto KindCost = Entry->Cost[CostKind])
3152 return *KindCost;
3153 }
3154
3155 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3156 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3157 // fp16 conversions not covered by any table entries require a libcall.
3158 // Return a large (arbitrary) number to model this.
3159 return InstructionCost(64);
3160 }
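// (Without F16C such casts typically lower to compiler-rt helpers, e.g.
// __extendhfsf2 / __truncsfhf2, so any suitably large constant is a
// reasonable proxy here.)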
3161 }
3162
3163 // Fall back to legalized types.
3164 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3165 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3166
3167 // If we're truncating to the same legalized type - just assume it's free.
3168 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3169 return TTI::TCC_Free;
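// For instance, 'trunc i128 %x to i64' legalizes both sides to i64, so the
// truncate itself costs nothing beyond the legalization already paid.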
3170
3171 if (ST->useAVX512Regs()) {
3172 if (ST->hasBWI())
3173 if (const auto *Entry = ConvertCostTableLookup(
3174 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3175 if (auto KindCost = Entry->Cost[CostKind])
3176 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3177
3178 if (ST->hasDQI())
3179 if (const auto *Entry = ConvertCostTableLookup(
3180 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3181 if (auto KindCost = Entry->Cost[CostKind])
3182 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3183
3184 if (ST->hasAVX512())
3185 if (const auto *Entry = ConvertCostTableLookup(
3186 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3187 if (auto KindCost = Entry->Cost[CostKind])
3188 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3189 }
3190
3191 if (ST->hasBWI())
3192 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3193 LTDest.second, LTSrc.second))
3194 if (auto KindCost = Entry->Cost[CostKind])
3195 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3196
3197 if (ST->hasDQI())
3198 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3199 LTDest.second, LTSrc.second))
3200 if (auto KindCost = Entry->Cost[CostKind])
3201 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3202
3203 if (ST->hasAVX512())
3204 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3205 LTDest.second, LTSrc.second))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3208
3209 if (ST->hasAVX2())
3210 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3211 LTDest.second, LTSrc.second))
3212 if (auto KindCost = Entry->Cost[CostKind])
3213 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3214
3215 if (ST->hasAVX())
3216 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3217 LTDest.second, LTSrc.second))
3218 if (auto KindCost = Entry->Cost[CostKind])
3219 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3220
3221 if (ST->hasF16C()) {
3222 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3223 LTDest.second, LTSrc.second))
3224 if (auto KindCost = Entry->Cost[CostKind])
3225 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3226 }
3227
3228 if (ST->hasSSE41())
3229 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3230 LTDest.second, LTSrc.second))
3231 if (auto KindCost = Entry->Cost[CostKind])
3232 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3233
3234 if (ST->hasSSE2())
3235 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3236 LTDest.second, LTSrc.second))
3237 if (auto KindCost = Entry->Cost[CostKind])
3238 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3239
3240 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend to i32 first,
3241 // then perform the sitofp.
3242 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3243 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3244 Type *ExtSrc = Src->getWithNewBitWidth(32);
3245 unsigned ExtOpc =
3246 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3247
3248 // For scalar loads the extend would be free.
3249 InstructionCost ExtCost = 0;
3250 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3251 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3252
3253 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3254 TTI::CastContextHint::None, CostKind);
3255 }
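// e.g. 'sitofp i16 %x to float' is modeled as sext i16 -> i32 plus
// sitofp i32 -> float, with the extend treated as free for scalar loads.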
3256
3257 // Fallback for fptosi/fptoui i8/i16 cases: compute as fptosi to i32, then
3258 // truncate down to the destination width.
3259 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3260 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3261 Type *TruncDst = Dst->getWithNewBitWidth(32);
3262 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3263 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3264 TTI::CastContextHint::None, CostKind);
3265 }
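// e.g. 'fptoui float %x to i8' is modeled as fptosi f32 -> i32 plus a
// trunc i32 -> i8 (an i8/i16 result always fits the signed i32 range).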
3266
3267 // TODO: Allow non-throughput costs that aren't binary.
3268 auto AdjustCost = [&CostKind](InstructionCost Cost,
3269 InstructionCost N = 1) -> InstructionCost {
3270 if (CostKind != TTI::TCK_RecipThroughput)
3271 return Cost == 0 ? 0 : N;
3272 return Cost * N;
3273 };
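// i.e. for the non-throughput cost kinds a nonzero base cost collapses to N,
// while TCK_RecipThroughput scales linearly with N.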
3274 return AdjustCost(
3275 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3276}
3277
3278 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3279 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3280 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3281 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3282 // Early out if this type isn't scalar/vector integer/float.
3283 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3284 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3285 Op1Info, Op2Info, I);
3286
3287 // Legalize the type.
3288 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3289
3290 MVT MTy = LT.second;
3291
3292 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3293 assert(ISD && "Invalid opcode");
3294
3295 InstructionCost ExtraCost = 0;
3296 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3297 // Some vector comparison predicates cost extra instructions.
3298 // TODO: Adjust ExtraCost based on CostKind?
3299 // TODO: Should we invert this and assume worst case cmp costs
3300 // and reduce for particular predicates?
3301 if (MTy.isVector() &&
3302 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3303 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3304 ST->hasBWI())) {
3305 // Fall back to I's predicate if a specific predicate wasn't specified.
3306 CmpInst::Predicate Pred = VecPred;
3307 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3308 Pred == CmpInst::BAD_FCMP_PREDICATE))
3309 Pred = cast<CmpInst>(I)->getPredicate();
3310
3311 bool CmpWithConstant = false;
3312 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3313 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3314
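// Comparisons against a constant RHS are cheaper for several predicates
// below, since the extra negation/bias instruction can usually be folded
// into a rewritten constant operand.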
3315 switch (Pred) {
3316 case CmpInst::Predicate::ICMP_NE:
3317 // xor(cmpeq(x,y),-1)
3318 ExtraCost = CmpWithConstant ? 0 : 1;
3319 break;
3320 case CmpInst::Predicate::ICMP_SGE:
3321 case CmpInst::Predicate::ICMP_SLE:
3322 // xor(cmpgt(x,y),-1)
3323 ExtraCost = CmpWithConstant ? 0 : 1;
3324 break;
3325 case CmpInst::Predicate::ICMP_ULT:
3326 case CmpInst::Predicate::ICMP_UGT:
3327 // cmpgt(xor(x,signbit),xor(y,signbit))
3328 // xor(cmpeq(pmaxu(x,y),x),-1)
3329 ExtraCost = CmpWithConstant ? 1 : 2;
3330 break;
3331 case CmpInst::Predicate::ICMP_ULE:
3332 case CmpInst::Predicate::ICMP_UGE:
3333 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3334 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3335 // cmpeq(psubus(x,y),0)
3336 // cmpeq(pminu(x,y),x)
3337 ExtraCost = 1;
3338 } else {
3339 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3340 ExtraCost = CmpWithConstant ? 2 : 3;
3341 }
3342 break;
3343 case CmpInst::Predicate::FCMP_ONE:
3344 case CmpInst::Predicate::FCMP_UEQ:
3345 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3346 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3347 if (CondTy && !ST->hasAVX())
3348 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3349 CmpInst::Predicate::FCMP_UNO, CostKind,
3350 Op1Info, Op2Info) +
3351 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3352 CmpInst::Predicate::FCMP_OEQ, CostKind,
3353 Op1Info, Op2Info) +
3354 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3355
3356 break;
3357 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3358 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3359 // Assume worst case scenario and add the maximum extra cost.
3360 ExtraCost = 3;
3361 break;
3362 default:
3363 break;
3364 }
3365 }
3366 }
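// Worked example (SSE2 only): ICMP_ULT on <16 x i8> biases both operands by
// the sign bit and uses pcmpgtb, giving ExtraCost = 2, or 1 when the bias
// folds into a constant RHS.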
3367
3368 static const CostKindTblEntry SLMCostTbl[] = {
3369 // slm pcmpeq/pcmpgt throughput is 2
3370 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3371 // slm pblendvb/blendvpd/blendvps throughput is 4
3372 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3373 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3374 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3375 { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb
3376 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3377 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3378 };
3379
3380 static const CostKindTblEntry AVX512BWCostTbl[] = {
3381 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3382 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3383 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3384 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3385
3386 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3387 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3388 };
3389
3390 static const CostKindTblEntry AVX512CostTbl[] = {
3391 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3392 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3393 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3394 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3395
3396 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3397 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3398 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3399 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3400 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3401 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3402 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3403
3404 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3405 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3406 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3407 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3408 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3409 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3410 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3411 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3412 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3413 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3414 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3415 { ISD::SELECT, MVT::v8f32, { 1, 1, 1, 1 } },
3416 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3417 { ISD::SELECT, MVT::f32, { 1, 1, 1, 1 } },
3418
3419 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3420 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3421 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3422 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3423 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3424 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3425 };
3426
3427 static const CostKindTblEntry AVX2CostTbl[] = {
3428 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3429 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3430 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3431 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3432 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3433 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3434
3435 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3436 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3437 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3438 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3439
3440 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3441 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3442 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3443 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3444 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3445 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3446 };
3447
3448 static const CostKindTblEntry XOPCostTbl[] = {
3449 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3450 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3451 };
3452
3453 static const CostKindTblEntry AVX1CostTbl[] = {
3454 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3455 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3456 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3457 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3458 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3459 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3460
3461 // AVX1 does not support 8-wide integer compare.
3462 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3463 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3464 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3465 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3466
3467 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3468 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3469 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3470 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3471 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3472 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3473 };
3474
3475 static const CostKindTblEntry SSE42CostTbl[] = {
3476 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3477 };
3478
3479 static const CostKindTblEntry SSE41CostTbl[] = {
3480 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3481 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3482
3483 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3484 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3485 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3486 { ISD::SELECT, MVT::f32, { 2, 2, 1, 2 } }, // blendvps
3487 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3488 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3489 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3490 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3491 };
3492
3493 static const CostKindTblEntry SSE2CostTbl[] = {
3494 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3495 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3496
3497 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3498 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3499 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3500 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3501
3502 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3503 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3504 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3505 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3506 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3507 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3508 };
3509
3510 static const CostKindTblEntry SSE1CostTbl[] = {
3511 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3512 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3513
3514 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3515 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3516 };
3517
3518 if (ST->useSLMArithCosts())
3519 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3520 if (auto KindCost = Entry->Cost[CostKind])
3521 return LT.first * (ExtraCost + *KindCost);
3522
3523 if (ST->hasBWI())
3524 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3525 if (auto KindCost = Entry->Cost[CostKind])
3526 return LT.first * (ExtraCost + *KindCost);
3527
3528 if (ST->hasAVX512())
3529 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3530 if (auto KindCost = Entry->Cost[CostKind])
3531 return LT.first * (ExtraCost + *KindCost);
3532
3533 if (ST->hasAVX2())
3534 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3535 if (auto KindCost = Entry->Cost[CostKind])
3536 return LT.first * (ExtraCost + *KindCost);
3537
3538 if (ST->hasXOP())
3539 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3540 if (auto KindCost = Entry->Cost[CostKind])
3541 return LT.first * (ExtraCost + *KindCost);
3542
3543 if (ST->hasAVX())
3544 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3545 if (auto KindCost = Entry->Cost[CostKind])
3546 return LT.first * (ExtraCost + *KindCost);
3547
3548 if (ST->hasSSE42())
3549 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3550 if (auto KindCost = Entry->Cost[CostKind])
3551 return LT.first * (ExtraCost + *KindCost);
3552
3553 if (ST->hasSSE41())
3554 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3555 if (auto KindCost = Entry->Cost[CostKind])
3556 return LT.first * (ExtraCost + *KindCost);
3557
3558 if (ST->hasSSE2())
3559 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3560 if (auto KindCost = Entry->Cost[CostKind])
3561 return LT.first * (ExtraCost + *KindCost);
3562
3563 if (ST->hasSSE1())
3564 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3565 if (auto KindCost = Entry->Cost[CostKind])
3566 return LT.first * (ExtraCost + *KindCost);
3567
3568 // Assume a 3cy latency for fp select ops.
3569 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3570 if (ValTy->getScalarType()->isFloatingPointTy())
3571 return 3;
3572
3573 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3574 Op1Info, Op2Info, I);
3575}
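// Editorial sketch (not part of the upstream file): each CostKindTblEntry
// above stores the four TargetCostKind values in the order
// { RecipThroughput, Latency, CodeSize, SizeAndLatency }, and the if-chains
// walk the tables from the most specific feature level down, scaling the
// first hit by the number of legalized registers. A minimal standalone
// lookup, assuming the CostTableLookup helper from llvm/CodeGen/CostTable.h:
//
//   static const CostKindTblEntry Tbl[] = {
//     { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } }, // e.g. AVX2 vpcmpgtd
//   };
//   if (const auto *Entry = CostTableLookup(Tbl, ISD::SETCC, MTy))
//     if (std::optional<unsigned> KindCost = Entry->Cost[CostKind])
//       return LT.first * (ExtraCost + *KindCost);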
3576
3577InstructionCost
3578X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3579 TTI::TargetCostKind CostKind) {
3582 // Costs should match the codegen from:
3583 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3584 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3585 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3586 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3587 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3588
3589 // TODO: Overflow intrinsics (*ADDO, *SUBO) with vector types are not
3590 // specialized in these tables yet (vector *MULO entries are present below).
3591 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3592 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3593 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3594 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3595 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3596 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3597 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3598 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3599 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3600 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3601 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3602 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3603 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3604 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3605 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3606 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3607 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3608 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3609 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3610 };
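// The single-uop entries above reflect that AVX512-VBMI2 adds dedicated
// funnel-shift instructions (VPSHLDV/VPSHRDV and their immediate forms), so,
// illustratively, IR such as
//   %r = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b,
//                                        <8 x i16> %c)
// lowers to a single vpshldvw instead of a shift/shift/or sequence.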
3611 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3612 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3613 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3614 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3615 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3616 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3617 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3618 };
3619 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3620 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3621 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3622 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3623 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3624 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3625 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3626 };
3627 static const CostKindTblEntry AVX512CDCostTbl[] = {
3628 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3629 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3630 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3631 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3632 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3633 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3634 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3635 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3636 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3637 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3638 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3639 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3640
3641 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3642 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3643 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3644 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3645 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3646 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3647 };
3648 static const CostKindTblEntry AVX512BWCostTbl[] = {
3649 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3650 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3651 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3652 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3653 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3654 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3655 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3656 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3657 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3658 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3659 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3660 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3661 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3662 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3663 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3664 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3665 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3666 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3667 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3668 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3669 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3670 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3671 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3672 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3673 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3674 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3675 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3676 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3677 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3678 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3679 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3680 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3681 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3682 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3683 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3684 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3685 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3686 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3687 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3688 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3689 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3690 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3691 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3692 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3693 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3694 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3695 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3696 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3697 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3698 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3699 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3700 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3701 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3702 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3703 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3704 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3705 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3706 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3707 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3708 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3709 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3710 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3711 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3712 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3713 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3714 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3715 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3716 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3717 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3718 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3719 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3720 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3721 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3722 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3723 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3724 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3725 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3726 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3727 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3728 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3729 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3730 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3731 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3732 };
3733 static const CostKindTblEntry AVX512CostTbl[] = {
3734 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3735 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3736 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3737 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3738 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3739 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3740 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3741 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3742 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3743 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3744 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3745 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3746 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3747 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3748 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3749 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3750 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3751 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3752 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3753 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3754 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3755 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3756 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3757 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3758 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3759 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3760 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3761 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3762 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3763 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3764 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3765 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3766 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3767 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3768 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3769 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3770 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3771 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3772 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3773 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3774 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3775 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3776 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3777 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3778 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3779 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3780 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3781 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3782 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3783 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3784 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3785 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3786 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3787 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3788 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3789 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3790 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3791 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3792 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3793 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3794 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3795 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3796 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3797 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3798 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3799 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3800 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3801 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3802 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3803 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3804 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3805 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3806 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3807 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3808 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3809 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3810 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3811 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3812 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3813 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3814 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3815 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3816 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3817 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3818 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3819 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3820 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3821 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3822 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3823 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3824 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3825 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3826 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3827 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3828 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3829 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3830 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3831 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3832 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3833 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3834 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3835 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3836 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3837 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3838 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3839 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3840 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3841 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3842 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3843 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3844 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3845 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3846 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3847 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3848 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3849 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3850 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3851 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3852 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3853 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3854 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3855 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3856 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3857 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3858 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3859 };
3860 static const CostKindTblEntry XOPCostTbl[] = {
3861 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3862 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3863 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3864 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3865 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3866 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3867 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3868 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3869 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3870 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3871 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3872 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3873 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3874 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3875 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3876 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3877 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3878 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3879 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3880 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3881 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3882 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3883 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3884 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3885 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3886 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3887 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3888 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3889 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3890 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3891 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3892 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3893 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3894 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3895 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3896 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3897 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3898 };
3899 static const CostKindTblEntry AVX2CostTbl[] = {
3900 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3901 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3902 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3903 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3904 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3905 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3906 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3907 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3908 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3909 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3910 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3911 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3912 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3913 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3914 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3915 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3916 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3917 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3918 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3919 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3920 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3921 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3922 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3923 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3924 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3925 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3926 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3927 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3928 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3929 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3930 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3931 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3932 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3933 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3934 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3935 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3936 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3937 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3938 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3939 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3940 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3941 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3942 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3943 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3944 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3945 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3946 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3947 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3948 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3949 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3950 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3951 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3952 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3953 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3954 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3955 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3956 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3957 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3958 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3959 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3960 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3961 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3962 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3963 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3964 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3965 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3966 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3967 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3968 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3969 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3970 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3971 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3972 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3973 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3974 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3975 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3976 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3977 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3978 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3979 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3980 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3981 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3982 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3983 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3984 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3985 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3986 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3987 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3988 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3989 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3990 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3991 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
3992 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
3993 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
3994 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
3995 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
3996 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
3997 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
3998 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
3999 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4000 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4001 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4002 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4003 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4004 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4005 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4006 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4007 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4008 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4009 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4010 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4011 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4012 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4013 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4014 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4015 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4016 };
4017 static const CostKindTblEntry AVX1CostTbl[] = {
4018 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4019 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4020 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4021 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4022 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4023 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4024 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4025 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4026 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4027 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4028 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4029 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4030 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4031 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4032 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4033 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4034 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4035 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4036 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4037 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4038 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4039 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4040 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4041 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4042 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4043 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4044 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4045 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4046 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4047 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4048 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4049 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4050 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4051 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4052 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4053 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4054 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4055 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4056 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4057 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4058 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4059 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4060 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4061 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4062 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4063 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4064 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4065 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4066 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4067 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4068 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4069 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4070 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4071 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4072 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4073 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4074 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4075 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4076 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4077 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4078 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4079 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4080 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4081 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4082 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4083 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4084 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4085 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4086 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4088 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4089 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4090 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4091 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4093 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4095 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4097 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4099 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4100 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4101 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4103 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4104 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4105 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4106 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4107 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4108 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4109 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4110 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4111 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4112 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4113 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4115 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4116 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4118 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4119 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4120 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4121 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4122 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4123 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4124 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4125 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4126 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4127 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4128 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4129 };
4130 static const CostKindTblEntry GFNICostTbl[] = {
4131 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4132 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4133 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4134 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4135 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4136 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4137 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4138 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4139 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4140 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4141 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4142 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4143 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4144 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4145 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4146 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4147 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4148 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4149 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4150 };
4151 static const CostKindTblEntry GLMCostTbl[] = {
4152 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4153 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4154 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4155 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4156 };
4157 static const CostKindTblEntry SLMCostTbl[] = {
4158 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4159 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4160 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4161 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4162 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4163 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4164 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4165 };
4166 static const CostKindTblEntry SSE42CostTbl[] = {
4167 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4168 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4169 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4170 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4171 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4172 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4173 };
4174 static const CostKindTblEntry SSE41CostTbl[] = {
4175 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4176 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4177 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4178 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4179 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4180 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4181 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4182 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4183 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4184 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4185 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4186 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4187 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4188 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4189 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4190 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4191 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4192 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4193 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4194 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4195 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4196 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4197 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4198 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4199 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4200 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4201 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4202 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4203 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4204 };
4205 static const CostKindTblEntry SSSE3CostTbl[] = {
4206 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4207 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4208 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4209 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4210 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4211 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4212 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4213 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4214 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4215 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4216 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4217 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4218 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4219 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4220 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4221 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4222 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4223 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4224 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4225 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4226 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4227 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4228 };
4229 static const CostKindTblEntry SSE2CostTbl[] = {
4230 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4231 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4232 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4233 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4234 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4235 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4236 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4237 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4238 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4239 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4240 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4241 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4242 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4243 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4244 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4245 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4246 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4247 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4248 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4249 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4250 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4251 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4252 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4253 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4254 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4255 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4256 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4257 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4258 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4259 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4260 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4261 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4262 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4263 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4264 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4265 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4266 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4267 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4268 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4269 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4270 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4271 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4272 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4273 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4274 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4275 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4276 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4277 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4278 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4279 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4280 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4281 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4282 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4283 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4284 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4285 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4286 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4287 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4288 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4289 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4290 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4291 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4292 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4293 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4294 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4295 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4296 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4297 };
4298 static const CostKindTblEntry SSE1CostTbl[] = {
4299 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4300 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4301 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4302 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4303 };
4304 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4305 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4306 };
4307 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4308 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4309 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4310 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4311 };
4312 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4313 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4314 };
4315 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4316 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4317 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4318 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4319 };
4320 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4321 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4322 };
4323 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4324 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4325 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4326 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4327 };
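// The extra CodeSize/SizeAndLatency cost for i16/i8 reflects the
// popcnt(zext()) expansion noted above; a sketch of typical codegen:
//   movzwl %di, %eax       # zero-extend the i16 operand
//   popcntl %eax, %eax     # full-width population count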
4328 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4329 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4330 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4331 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4332 { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4333 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4334 { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
4335 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4336 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4337 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4338 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4339 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4340 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4341 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4342 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4343 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4344 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4345 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4346 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4347 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4348 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4349 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4350 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4351 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4352 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4353 };
4354 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4355 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4356 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4357 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4358 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4359 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4360 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4361 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4362 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4363 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4364 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4365 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4366 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4367 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4368 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4369 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4370 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4371 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4372 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4373 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4374 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4375 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4376 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4377 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4378 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4379 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4380 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4381 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4382 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4383 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4384 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4385 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4386 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4387 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4388 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4389 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4390 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4391 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4392 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4393 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4394 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4395 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4396 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4397 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4398 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4399 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4400 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4401 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4402 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4403 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4404 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4405 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4406 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4407 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4408 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4409 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4410 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4411 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4412 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4413 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4414 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4415 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4416 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4417 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4418 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4419 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4420 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4421 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4422 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4423 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4424 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4425 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4426 };
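// The X64 table is consulted only on 64-bit targets, where MVT::i64 is a
// legal scalar type; the X86 table applies to both. A condensed sketch of
// the intended dispatch, mirroring the feature-gated lookups later in this
// function:
//
//   if (ST->is64Bit())
//     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
//       if (auto KindCost = Entry->Cost[CostKind])
//         return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
//   if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());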
4427
4428 Type *RetTy = ICA.getReturnType();
4429 Type *OpTy = RetTy;
4430 Intrinsic::ID IID = ICA.getID();
4431 unsigned ISD = ISD::DELETED_NODE;
4432 switch (IID) {
4433 default:
4434 break;
4435 case Intrinsic::abs:
4436 ISD = ISD::ABS;
4437 break;
4438 case Intrinsic::bitreverse:
4439 ISD = ISD::BITREVERSE;
4440 break;
4441 case Intrinsic::bswap:
4442 ISD = ISD::BSWAP;
4443 break;
4444 case Intrinsic::ctlz:
4445 ISD = ISD::CTLZ;
4446 break;
4447 case Intrinsic::ctpop:
4448 ISD = ISD::CTPOP;
4449 break;
4450 case Intrinsic::cttz:
4451 ISD = ISD::CTTZ;
4452 break;
4453 case Intrinsic::fshl:
4454 ISD = ISD::FSHL;
4455 if (!ICA.isTypeBasedOnly()) {
4456 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4457 if (Args[0] == Args[1]) {
4458 ISD = ISD::ROTL;
4459 // Handle uniform constant rotation amounts.
4460 // TODO: Handle funnel-shift cases.
4461 const APInt *Amt;
4462 if (Args[2] &&
4463 PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4464 ISD = X86ISD::VROTLI;
4465 }
4466 }
4467 break;
4468 case Intrinsic::fshr:
4469 // FSHR has the same costs, so don't duplicate.
4470 ISD = ISD::FSHL;
4471 if (!ICA.isTypeBasedOnly()) {
4472 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4473 if (Args[0] == Args[1]) {
4474 ISD = ISD::ROTR;
4475 // Handle uniform constant rotation amount.
4476 // TODO: Handle funnel-shift cases.
4477 const APInt *Amt;
4478 if (Args[2] &&
4479 PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4480 ISD = X86ISD::VROTLI;
4481 }
4482 }
4483 break;
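// Both funnel-shift intrinsics collapse to plain rotates when the two value
// operands are the same SSA value, and to a rotate-by-immediate when the
// amount is also a uniform constant, e.g. (illustrative IR):
//   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %n) ; costed as ISD::ROTL
//   %s = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 5)  ; costed as X86ISD::VROTLI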
4484 case Intrinsic::lrint:
4485 case Intrinsic::llrint: {
4486 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4487 // have the same costs as the CVTTP2SI (fptosi) instructions
4488 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4489 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4490 TTI::CastContextHint::None, CostKind);
4491 }
4492 case Intrinsic::maxnum:
4493 case Intrinsic::minnum:
4494 // FMINNUM has the same costs, so don't duplicate.
4495 ISD = ISD::FMAXNUM;
4496 break;
4497 case Intrinsic::sadd_sat:
4498 ISD = ISD::SADDSAT;
4499 break;
4500 case Intrinsic::smax:
4501 ISD = ISD::SMAX;
4502 break;
4503 case Intrinsic::smin:
4504 ISD = ISD::SMIN;
4505 break;
4506 case Intrinsic::ssub_sat:
4507 ISD = ISD::SSUBSAT;
4508 break;
4509 case Intrinsic::uadd_sat:
4510 ISD = ISD::UADDSAT;
4511 break;
4512 case Intrinsic::umax:
4513 ISD = ISD::UMAX;
4514 break;
4515 case Intrinsic::umin:
4516 ISD = ISD::UMIN;
4517 break;
4518 case Intrinsic::usub_sat:
4519 ISD = ISD::USUBSAT;
4520 break;
4521 case Intrinsic::sqrt:
4522 ISD = ISD::FSQRT;
4523 break;
4524 case Intrinsic::sadd_with_overflow:
4525 case Intrinsic::ssub_with_overflow:
4526 // SSUBO has the same costs, so don't duplicate.
4527 ISD = ISD::SADDO;
4528 OpTy = RetTy->getContainedType(0);
4529 break;
4530 case Intrinsic::uadd_with_overflow:
4531 case Intrinsic::usub_with_overflow:
4532 // USUBO has the same costs, so don't duplicate.
4533 ISD = ISD::UADDO;
4534 OpTy = RetTy->getContainedType(0);
4535 break;
4536 case Intrinsic::smul_with_overflow:
4537 ISD = ISD::SMULO;
4538 OpTy = RetTy->getContainedType(0);
4539 break;
4540 case Intrinsic::umul_with_overflow:
4541 ISD = ISD::UMULO;
4542 OpTy = RetTy->getContainedType(0);
4543 break;
4544 }
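// For the *_with_overflow intrinsics the result is a { value, i1 } aggregate,
// so OpTy is switched to the first contained type and the tables are keyed on
// that, e.g. (illustrative IR):
//   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// is legalized and costed as ISD::SADDO on MVT::i32.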
4545
4546 if (ISD != ISD::DELETED_NODE) {
4547 auto adjustTableCost = [&](int ISD, unsigned Cost,
4548 std::pair<InstructionCost, MVT> LT,
4549 FastMathFlags FMF) -> InstructionCost {
4550 InstructionCost LegalizationCost = LT.first;
4551 MVT MTy = LT.second;
4552
4553 // If there are no NaNs to deal with, then these are reduced to a
4554 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4555 // assume is used in the non-fast case.
4556 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4557 if (FMF.noNaNs())
4558 return LegalizationCost * 1;
4559 }
4560
4561 // For cases where some ops can be folded into a load/store, assume free.
4562 if (MTy.isScalarInteger()) {
4563 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4564 if (const Instruction *II = ICA.getInst()) {
4565 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4566 return TTI::TCC_Free;
4567 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4568 if (LI->hasOneUse())
4569 return TTI::TCC_Free;
4570 }
4571 }
4572 }
4573 }
4574
4575 return LegalizationCost * (int)Cost;
4576 };
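// Example of the load/store folding above (illustrative IR): with MOVBE and
// the fast-MOVBE tuning flag, either pattern is treated as free because the
// byte swap folds into the memory access:
//   %v = load i32, ptr %p
//   %b = call i32 @llvm.bswap.i32(i32 %v) ; single-use load  -> movbe load
//   store i32 %b, ptr %q                  ; store of sole use -> movbe store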
4577
4578 // Legalize the type.
4579 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4580 MVT MTy = LT.second;
4581
4582 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4583 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4584 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4585 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4586 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4587 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
3588 if (Cst->isAllOnesValue())
3589 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
3590 }
4591
4592 // FSQRT is a single instruction.
4593 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4594 return LT.first;
4595
4596 if (ST->useGLMDivSqrtCosts())
4597 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4598 if (auto KindCost = Entry->Cost[CostKind])
4599 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4600
4601 if (ST->useSLMArithCosts())
4602 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4603 if (auto KindCost = Entry->Cost[CostKind])
4604 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4605
4606 if (ST->hasVBMI2())
4607 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4608 if (auto KindCost = Entry->Cost[CostKind])
4609 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4610
4611 if (ST->hasBITALG())
4612 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4613 if (auto KindCost = Entry->Cost[CostKind])
4614 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4615
4616 if (ST->hasVPOPCNTDQ())
4617 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4618 if (auto KindCost = Entry->Cost[CostKind])
4619 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4620
4621 if (ST->hasGFNI())
4622 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4623 if (auto KindCost = Entry->Cost[CostKind])
4624 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4625
4626 if (ST->hasCDI())
4627 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4628 if (auto KindCost = Entry->Cost[CostKind])
4629 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4630
4631 if (ST->hasBWI())
4632 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4633 if (auto KindCost = Entry->Cost[CostKind])
4634 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4635
4636 if (ST->hasAVX512())
4637 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4638 if (auto KindCost = Entry->Cost[CostKind])
4639 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4640
4641 if (ST->hasXOP())
4642 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4643 if (auto KindCost = Entry->Cost[CostKind])
4644 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4645
4646 if (ST->hasAVX2())
4647 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4648 if (auto KindCost = Entry->Cost[CostKind])
4649 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4650
4651 if (ST->hasAVX())
4652 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4653 if (auto KindCost = Entry->Cost[CostKind])
4654 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4655
4656 if (ST->hasSSE42())
4657 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4658 if (auto KindCost = Entry->Cost[CostKind])
4659 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4660
4661 if (ST->hasSSE41())
4662 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4663 if (auto KindCost = Entry->Cost[CostKind])
4664 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4665
4666 if (ST->hasSSSE3())
4667 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4668 if (auto KindCost = Entry->Cost[CostKind])
4669 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4670
4671 if (ST->hasSSE2())
4672 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4673 if (auto KindCost = Entry->Cost[CostKind])
4674 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4675
4676 if (ST->hasSSE1())
4677 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4678 if (auto KindCost = Entry->Cost[CostKind])
4679 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4680
4681 if (ST->hasBMI()) {
4682 if (ST->is64Bit())
4683 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4684 if (auto KindCost = Entry->Cost[CostKind])
4685 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4686
4687 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4688 if (auto KindCost = Entry->Cost[CostKind])
4689 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4690 }
4691
4692 if (ST->hasLZCNT()) {
4693 if (ST->is64Bit())
4694 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4695 if (auto KindCost = Entry->Cost[CostKind])
4696 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4697
4698 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4699 if (auto KindCost = Entry->Cost[CostKind])
4700 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4701 }
4702
4703 if (ST->hasPOPCNT()) {
4704 if (ST->is64Bit())
4705 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4706 if (auto KindCost = Entry->Cost[CostKind])
4707 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4708
4709 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4710 if (auto KindCost = Entry->Cost[CostKind])
4711 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4712 }
4713
4714 if (ST->is64Bit())
4715 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4716 if (auto KindCost = Entry->Cost[CostKind])
4717 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4718
4719 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4720 if (auto KindCost = Entry->Cost[CostKind])
4721 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4722
4723 // Without arg data, we need to compute the expanded costs of custom lowered
4724 // intrinsics to prevent use of the (very low) default costs.
4725 if (ICA.isTypeBasedOnly() &&
4726 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4727 Type *CondTy = RetTy->getWithNewBitWidth(1);
4728 InstructionCost Cost = 0;
4729 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4730 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4731 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4732 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4733 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4734 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4735 CmpInst::ICMP_EQ, CostKind);
4736 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4737 CmpInst::ICMP_EQ, CostKind);
4738 return Cost;
4739 }
4740 }
4741
4742 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4743}
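// Illustrative note (an addition for exposition, not from the upstream cost
// tables): the dispatch above walks the per-ISA tables from the most specific
// feature set down to SSE1 and returns the first hit, scaled by
// adjustTableCost. A hedged sketch of a caller-side query, assuming a valid
// TargetTransformInfo result TTI and a fixed vector type VecTy:
//
//   IntrinsicCostAttributes ICA(Intrinsic::smax, VecTy, {VecTy, VecTy});
//   InstructionCost C = TTI.getIntrinsicInstrCost(
//       ICA, TargetTransformInfo::TCK_RecipThroughput);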
4744
4745InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4746 TTI::TargetCostKind CostKind,
4747 unsigned Index, Value *Op0,
4748 Value *Op1) {
4749 static const CostTblEntry SLMCostTbl[] = {
4750 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4751 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4752 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4753 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4754 };
4755
4756 assert(Val->isVectorTy() && "This must be a vector type");
4757 Type *ScalarType = Val->getScalarType();
4758 InstructionCost RegisterFileMoveCost = 0;
4759
4760 // Non-immediate extraction/insertion can be handled as a sequence of
4761 // aliased loads+stores via the stack.
4762 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4763 Opcode == Instruction::InsertElement)) {
4764 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4765 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4766
4767 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4768 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4769 Align VecAlign = DL.getPrefTypeAlign(Val);
4770 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4771
4772 // Extract - store vector to stack, load scalar.
4773 if (Opcode == Instruction::ExtractElement) {
4774 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4775 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4776 CostKind);
4777 }
4778 // Insert - store vector to stack, store scalar, load vector.
4779 if (Opcode == Instruction::InsertElement) {
4780 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4781 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4782 CostKind) +
4783 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4784 }
4785 }
4786
4787 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4788 Opcode == Instruction::InsertElement)) {
4789 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4790 if (Opcode == Instruction::ExtractElement &&
4791 ScalarType->getScalarSizeInBits() == 1 &&
4792 cast<FixedVectorType>(Val)->getNumElements() > 1)
4793 return 1;
4794
4795 // Legalize the type.
4796 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4797
4798 // This type is legalized to a scalar type.
4799 if (!LT.second.isVector())
4800 return TTI::TCC_Free;
4801
4802 // The type may be split. Normalize the index to the new type.
4803 unsigned SizeInBits = LT.second.getSizeInBits();
4804 unsigned NumElts = LT.second.getVectorNumElements();
4805 unsigned SubNumElts = NumElts;
4806 Index = Index % NumElts;
4807
4808 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4809 // For inserts, we also need to insert the subvector back.
4810 if (SizeInBits > 128) {
4811 assert((SizeInBits % 128) == 0 && "Illegal vector");
4812 unsigned NumSubVecs = SizeInBits / 128;
4813 SubNumElts = NumElts / NumSubVecs;
4814 if (SubNumElts <= Index) {
4815 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4816 Index %= SubNumElts;
4817 }
4818 }
4819
4820 MVT MScalarTy = LT.second.getScalarType();
4821 auto IsCheapPInsrPExtrInsertPS = [&]() {
4822 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4823 // Inserting f32 into index0 is just movss.
4824 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4825 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4826 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4827 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4828 Opcode == Instruction::InsertElement) ||
4829 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4830 Opcode == Instruction::InsertElement);
4831 };
4832
4833 if (Index == 0) {
4834 // Floating point scalars are already located in index #0.
4835 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4836 // true for all.
4837 if (ScalarType->isFloatingPointTy() &&
4838 (Opcode != Instruction::InsertElement || !Op0 ||
4839 isa<UndefValue>(Op0)))
4840 return RegisterFileMoveCost;
4841
4842 if (Opcode == Instruction::InsertElement &&
4843 isa_and_nonnull<UndefValue>(Op0)) {
4844 // Consider the gather cost to be cheap.
4845 if (isa_and_nonnull<LoadInst>(Op1))
4846 return RegisterFileMoveCost;
4847 if (!IsCheapPInsrPExtrInsertPS()) {
4848 // mov constant-to-GPR + movd/movq GPR -> XMM.
4849 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4850 return 2 + RegisterFileMoveCost;
4851 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4852 return 1 + RegisterFileMoveCost;
4853 }
4854 }
4855
4856 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4857 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4858 return 1 + RegisterFileMoveCost;
4859 }
4860
4861 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4862 assert(ISD && "Unexpected vector opcode");
4863 if (ST->useSLMArithCosts())
4864 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4865 return Entry->Cost + RegisterFileMoveCost;
4866
4867 // Consider cheap cases.
4868 if (IsCheapPInsrPExtrInsertPS())
4869 return 1 + RegisterFileMoveCost;
4870
4871 // For extractions we just need to shuffle the element to index 0, which
4872 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4873 // the elements to its destination. In both cases we must handle the
4874 // subvector move(s).
4875 // If the vector type is already less than 128-bits then don't reduce it.
4876 // TODO: Under what circumstances should we shuffle using the full width?
4877 InstructionCost ShuffleCost = 1;
4878 if (Opcode == Instruction::InsertElement) {
4879 auto *SubTy = cast<VectorType>(Val);
4880 EVT VT = TLI->getValueType(DL, Val);
4881 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4882 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4883 ShuffleCost =
4884 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4885 }
4886 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4887 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4888 }
4889
4890 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4891 RegisterFileMoveCost;
4892}
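// Worked example (illustrative, not normative): extracting element 5 from a
// legal v8i32 on an AVX target:
//
//   %e = extractelement <8 x i32> %v, i32 5
//
// SizeInBits == 256, so NumSubVecs == 2 and SubNumElts == 4; index 5 lands in
// the upper 128-bit subvector, adding 1 to RegisterFileMoveCost and wrapping
// Index to 1. With SSE4.1 pextrd is treated as cheap, so the total reported
// cost is 1 + RegisterFileMoveCost == 2.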
4893
4894InstructionCost X86TTIImpl::getScalarizationOverhead(
4895 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4896 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
4897 assert(DemandedElts.getBitWidth() ==
4898 cast<FixedVectorType>(Ty)->getNumElements() &&
4899 "Vector size mismatch");
4900
4901 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4902 MVT MScalarTy = LT.second.getScalarType();
4903 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4904 InstructionCost Cost = 0;
4905
4906 constexpr unsigned LaneBitWidth = 128;
4907 assert((LegalVectorBitWidth < LaneBitWidth ||
4908 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4909 "Illegal vector");
4910
4911 const int NumLegalVectors = *LT.first.getValue();
4912 assert(NumLegalVectors >= 0 && "Negative cost!");
4913
4914 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4915 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4916 if (Insert) {
4917 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4918 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4919 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4920 // For types we can insert directly, insertion into 128-bit sub vectors is
4921 // cheap, followed by a cheap chain of concatenations.
4922 if (LegalVectorBitWidth <= LaneBitWidth) {
4923 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4924 /*Extract*/ false, CostKind);
4925 } else {
4926 // In each 128-bit lane, if at least one index is demanded but not all
4927 // indices are demanded and this 128-bit lane is not the first 128-bit
4928 // lane of the legalized vector, then this 128-bit lane needs an
4929 // extracti128; if a 128-bit lane has at least one demanded index, it
4930 // needs an inserti128.
4931
4932 // The following cases will help you build a better understanding:
4933 // Assume we insert several elements into a v8i32 vector in avx2,
4934 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
4935 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
4936 // inserti128.
4937 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4938 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4939 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4940 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4941 unsigned NumLegalElts =
4942 LT.second.getVectorNumElements() * NumLegalVectors;
4943 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4944 "Vector has been legalized to smaller element count");
4945 assert((NumLegalElts % NumLanesTotal) == 0 &&
4946 "Unexpected elts per lane");
4947 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4948
4949 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4950 auto *LaneTy =
4951 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4952
4953 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4954 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4955 NumEltsPerLane, NumEltsPerLane * I);
4956 if (LaneEltMask.isZero())
4957 continue;
4958 // FIXME: we don't need to extract if all non-demanded elements
4959 // are legalization-inserted padding.
4960 if (!LaneEltMask.isAllOnes())
4961 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4962 I * NumEltsPerLane, LaneTy);
4963 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4964 /*Extract*/ false, CostKind);
4965 }
4966
4967 APInt AffectedLanes =
4968 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4969 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4970 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4971 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4972 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4973 unsigned I = NumLegalLanes * LegalVec + Lane;
4974 // No need to insert unaffected lane; or lane 0 of each legal vector
4975 // iff ALL lanes of that vector were affected and will be inserted.
4976 if (!AffectedLanes[I] ||
4977 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4978 continue;
4979 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
4980 I * NumEltsPerLane, LaneTy);
4981 }
4982 }
4983 }
4984 } else if (LT.second.isVector()) {
4985 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4986 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4987 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4988 // considered cheap.
4989 if (Ty->isIntOrIntVectorTy())
4990 Cost += DemandedElts.popcount();
4991
4992 // Get the smaller of the legalized or original pow2-extended number of
4993 // vector elements, which represents the number of unpacks we'll end up
4994 // performing.
4995 unsigned NumElts = LT.second.getVectorNumElements();
4996 unsigned Pow2Elts =
4997 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4998 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4999 }
5000 }
5001
5002 if (Extract) {
5003 // vXi1 can be efficiently extracted with MOVMSK.
5004 // TODO: AVX512 predicate mask handling.
5005 // NOTE: This doesn't work well for roundtrip scalarization.
5006 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5007 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5008 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5009 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5010 return MOVMSKCost;
5011 }
5012
5013 if (LT.second.isVector()) {
5014 unsigned NumLegalElts =
5015 LT.second.getVectorNumElements() * NumLegalVectors;
5016 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5017 "Vector has been legalized to smaller element count");
5018
5019 // If we're extracting elements from a 128-bit subvector lane,
5020 // we only need to extract each lane once, not for every element.
5021 if (LegalVectorBitWidth > LaneBitWidth) {
5022 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5023 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5024 assert((NumLegalElts % NumLanesTotal) == 0 &&
5025 "Unexpected elts per lane");
5026 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5027
5028 // Add cost for each demanded 128-bit subvector extraction.
5029 // Luckily this is a lot easier than for insertion.
5030 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5031 auto *LaneTy =
5032 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5033
5034 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5035 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5036 NumEltsPerLane, I * NumEltsPerLane);
5037 if (LaneEltMask.isZero())
5038 continue;
5039 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5040 I * NumEltsPerLane, LaneTy);
5041 Cost += BaseT::getScalarizationOverhead(
5042 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5043 }
5044
5045 return Cost;
5046 }
5047 }
5048
5049 // Fallback to default extraction.
5050 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5051 Extract, CostKind);
5052 }
5053
5054 return Cost;
5055}
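// Worked example (illustrative): an extract-only scalarization of a v32i1
// mask without AVX512. With AVX2 a single vpmovmskb covers MaxElts == 32
// lanes, so the MOVMSK cost is ceil(32/32) == 1; with only SSE2
// (MaxElts == 16) the same query returns ceil(32/16) == 2.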
5056
5057InstructionCost
5058X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5059 int VF, const APInt &DemandedDstElts,
5060 TTI::TargetCostKind CostKind) {
5061 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5062 // We don't differentiate element types here, only element bit width.
5063 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5064
5065 auto bailout = [&]() {
5066 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5067 DemandedDstElts, CostKind);
5068 };
5069
5070 // For now, only deal with AVX512 cases.
5071 if (!ST->hasAVX512())
5072 return bailout();
5073
5074 // Do we have a native shuffle for this element type, or should we promote?
5075 unsigned PromEltTyBits = EltTyBits;
5076 switch (EltTyBits) {
5077 case 32:
5078 case 64:
5079 break; // AVX512F.
5080 case 16:
5081 if (!ST->hasBWI())
5082 PromEltTyBits = 32; // promote to i32, AVX512F.
5083 break; // AVX512BW
5084 case 8:
5085 if (!ST->hasVBMI())
5086 PromEltTyBits = 32; // promote to i32, AVX512F.
5087 break; // AVX512VBMI
5088 case 1:
5089 // There is no support for shuffling i1 elements. We *must* promote.
5090 if (ST->hasBWI()) {
5091 if (ST->hasVBMI())
5092 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5093 else
5094 PromEltTyBits = 16; // promote to i16, AVX512BW.
5095 break;
5096 }
5097 PromEltTyBits = 32; // promote to i32, AVX512F.
5098 break;
5099 default:
5100 return bailout();
5101 }
5102 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5103
5104 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5105 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5106
5107 int NumDstElements = VF * ReplicationFactor;
5108 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5109 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5110
5111 // Legalize the types.
5112 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5113 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5114 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5115 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5116 // They should have legalized into vector types.
5117 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5118 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5119 return bailout();
5120
5121 if (PromEltTyBits != EltTyBits) {
5122 // If we have to perform the shuffle with wider elt type than our data type,
5123 // then we will first need to anyext (we don't care about the new bits)
5124 // the source elements, and then truncate Dst elements.
5125 InstructionCost PromotionCost;
5126 PromotionCost += getCastInstrCost(
5127 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5128 TTI::CastContextHint::None, CostKind);
5129 PromotionCost +=
5130 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5131 /*Src=*/PromDstVecTy,
5132 TTI::CastContextHint::None, CostKind);
5133 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5134 ReplicationFactor, VF,
5135 DemandedDstElts, CostKind);
5136 }
5137
5138 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5139 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5140 "We expect that the legalization doesn't affect the element width, "
5141 "doesn't coalesce/split elements.");
5142
5143 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5144 unsigned NumDstVectors =
5145 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5146
5147 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5148
5149 // Not all the produced Dst elements may be demanded. In our case,
5150 // given that a single Dst vector is formed by a single shuffle,
5151 // if all elements that will form a single Dst vector aren't demanded,
5152 // then we won't need to do that shuffle, so adjust the cost accordingly.
5153 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5154 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5155 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5156
5157 InstructionCost SingleShuffleCost = getShuffleCost(
5158 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5159 /*Index=*/0, /*SubTp=*/nullptr);
5160 return NumDstVectorsDemanded * SingleShuffleCost;
5161}
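// Worked example (a sketch under stated assumptions): replicating each lane
// of a v8i32 twice (ReplicationFactor == 2, VF == 8) on AVX512F. i32 needs
// no promotion, the 16 destination lanes legalize to a single v16i32, and
// with all destination lanes demanded the reported cost is one
// SK_PermuteSingleSrc (vpermd-style) shuffle.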
5162
5163InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5164 MaybeAlign Alignment,
5165 unsigned AddressSpace,
5166 TTI::TargetCostKind CostKind,
5167 TTI::OperandValueInfo OpInfo,
5168 const Instruction *I) {
5169 // TODO: Handle other cost kinds.
5170 if (CostKind != TTI::TCK_RecipThroughput) {
5171 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5172 // Store instruction with index and scale costs 2 Uops.
5173 // Check the preceding GEP to identify non-const indices.
5174 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5175 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5176 return TTI::TCC_Basic * 2;
5177 }
5178 }
5179 return TTI::TCC_Basic;
5180 }
5181
5182 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5183 "Invalid Opcode");
5184 // Type legalization can't handle structs
5185 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5186 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5187 CostKind, OpInfo, I);
5188
5189 // Legalize the type.
5190 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5191
5192 auto *VTy = dyn_cast<FixedVectorType>(Src);
5193
5194 InstructionCost Cost = 0;
5195
5196 // Add a cost for constant load to vector.
5197 if (Opcode == Instruction::Store && OpInfo.isConstant())
5198 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5199 /*AddressSpace=*/0, CostKind, OpInfo);
5200
5201 // Handle the simple case of non-vectors.
5202 // NOTE: this assumes that legalization never creates vector from scalars!
5203 if (!VTy || !LT.second.isVector()) {
5204 // Each load/store unit costs 1.
5205 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5206 }
5207
5208 bool IsLoad = Opcode == Instruction::Load;
5209
5210 Type *EltTy = VTy->getElementType();
5211
5212 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5213
5214 // Source of truth: how many elements were there in the original IR vector?
5215 const unsigned SrcNumElt = VTy->getNumElements();
5216
5217 // How far have we gotten?
5218 int NumEltRemaining = SrcNumElt;
5219 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5220 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5221
5222 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5223
5224 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5225 const unsigned XMMBits = 128;
5226 if (XMMBits % EltTyBits != 0)
5227 // Vector size must be a multiple of the element size. I.e. no padding.
5228 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5229 CostKind, OpInfo, I);
5230 const int NumEltPerXMM = XMMBits / EltTyBits;
5231
5232 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5233
5234 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5235 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5236 // How many elements would a single op deal with at once?
5237 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5238 // Vector size must be a multiple of the element size. I.e. no padding.
5239 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5240 CostKind, OpInfo, I);
5241 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5242
5243 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5244 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5245 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5246 "Unless we haven't halved the op size yet, "
5247 "we have less than two op's sized units of work left.");
5248
5249 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5250 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5251 : XMMVecTy;
5252
5253 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5254 "After halving sizes, the vector elt count is no longer a multiple "
5255 "of number of elements per operation?");
5256 auto *CoalescedVecTy =
5257 CurrNumEltPerOp == 1
5258 ? CurrVecTy
5259 : FixedVectorType::get(
5260 IntegerType::get(Src->getContext(),
5261 EltTyBits * CurrNumEltPerOp),
5262 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5263 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5264 DL.getTypeSizeInBits(CurrVecTy) &&
5265 "coalesciing elements doesn't change vector width.");
5266
5267 while (NumEltRemaining > 0) {
5268 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5269
5270 // Can we use this vector size, as per the remaining element count?
5271 // Iff the vector is naturally aligned, we can do a wide load regardless.
5272 if (NumEltRemaining < CurrNumEltPerOp &&
5273 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5274 CurrOpSizeBytes != 1)
5275 break; // Try smaller vector size.
5276
5277 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5278 // as a proxy for a double-pumped AVX memory interface such as on
5279 // Sandybridge.
5280 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5281 // will be scalarized.
5282 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5283 Cost += 2;
5284 else if (CurrOpSizeBytes < 4)
5285 Cost += 2;
5286 else
5287 Cost += 1;
5288
5289 // If we're loading a uniform value, then we don't need to split the load;
5290 // a single load of the (widest) vector can be reused by all splits.
5291 if (IsLoad && OpInfo.isUniform())
5292 return Cost;
5293
5294 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5295
5296 // If we have fully processed the previous reg, we need to replenish it.
5297 if (SubVecEltsLeft == 0) {
5298 SubVecEltsLeft += CurrVecTy->getNumElements();
5299 // And that's free only for the 0'th subvector of a legalized vector.
5300 if (!Is0thSubVec)
5301 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5302 : TTI::ShuffleKind::SK_ExtractSubvector,
5303 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5304 }
5305
5306 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5307 // for smaller widths (32/16/8) we have to insert/extract them separately.
5308 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5309 // but let's pretend that it is also true for 16/8 bit wide ops...)
5310 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5311 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5312 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5313 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5314 APInt DemandedElts =
5315 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5316 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5317 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5318 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5319 !IsLoad, CostKind);
5320 }
5321
5322 SubVecEltsLeft -= CurrNumEltPerOp;
5323 NumEltRemaining -= CurrNumEltPerOp;
5324 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5325 }
5326 }
5327
5328 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5329
5330 return Cost;
5331}
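// Worked example (illustrative; exact numbers depend on the subtarget): a
// 4-byte-aligned load of <3 x i32> on SSE2. v3i32 legalizes to v4i32, but
// the 3 remaining elements can't use the full 16-byte op without natural
// alignment, so the loop halves to an 8-byte op (movq-like, cost 1, covers
// two elements) and then a 4-byte op for the tail element, which also pays
// the insertion overhead from getScalarizationOverhead because it isn't the
// 0'th subregister.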
5332
5333InstructionCost
5334X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5335 unsigned AddressSpace,
5336 TTI::TargetCostKind CostKind) {
5337 bool IsLoad = (Instruction::Load == Opcode);
5338 bool IsStore = (Instruction::Store == Opcode);
5339
5340 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5341 if (!SrcVTy)
5342 // To calculate the scalar cost, take the regular cost without the mask.
5343 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5344
5345 unsigned NumElem = SrcVTy->getNumElements();
5346 auto *MaskTy =
5347 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5348 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5349 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5350 // Scalarization
5351 APInt DemandedElts = APInt::getAllOnes(NumElem);
5352 InstructionCost MaskSplitCost = getScalarizationOverhead(
5353 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5354 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5355 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5356 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5357 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5358 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5359 InstructionCost ValueSplitCost = getScalarizationOverhead(
5360 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5361 InstructionCost MemopCost =
5362 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5363 Alignment, AddressSpace, CostKind);
5364 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5365 }
5366
5367 // Legalize the type.
5368 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5369 auto VT = TLI->getValueType(DL, SrcVTy);
5370 InstructionCost Cost = 0;
5371 MVT Ty = LT.second;
5372 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5373 // APX masked load/store for scalar is cheap.
5374 return Cost + LT.first;
5375
5376 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5377 LT.second.getVectorNumElements() == NumElem)
5378 // Promotion requires extend/truncate for data and a shuffle for mask.
5379 Cost +=
5380 getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
5381 nullptr) +
5382 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5383
5384 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5385 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5386 Ty.getVectorNumElements());
5387 // Expanding requires filling the mask with zeroes.
5388 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5389 MaskTy);
5390 }
5391
5392 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5393 if (!ST->hasAVX512())
5394 return Cost + LT.first * (IsLoad ? 2 : 8);
5395
5396 // AVX-512 masked load/store is cheaper
5397 return Cost + LT.first;
5398}
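// Illustrative sketch: a legal masked load of <8 x float> on an AVX
// (pre-AVX512) target legalizes to one YMM, skips the scalarization branch,
// and is charged Cost + LT.first * 2 == 2, matching the "each maskmov load
// costs 2" note above; the same operation as a masked store would be
// charged LT.first * 8.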
5399
5400InstructionCost
5401X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5402 const Value *Base,
5403 const TTI::PointersChainInfo &Info,
5404 Type *AccessTy, TTI::TargetCostKind CostKind) {
5405 if (Info.isSameBase() && Info.isKnownStride()) {
5406 // If all the pointers have known stride all the differences are translated
5407 // into constants. X86 memory addressing allows encoding it into
5408 // displacement. So we just need to take the base GEP cost.
5409 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5410 SmallVector<const Value *> Indices(BaseGEP->indices());
5411 return getGEPCost(BaseGEP->getSourceElementType(),
5412 BaseGEP->getPointerOperand(), Indices, nullptr,
5413 CostKind);
5414 }
5415 return TTI::TCC_Free;
5416 }
5417 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5418}
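// Example of the pattern this rewards (a sketch): unrolled accesses such as
//
//   %p0 = getelementptr inbounds i32, ptr %base, i64 0
//   %p1 = getelementptr inbounds i32, ptr %base, i64 1
//
// share one base with known strides, so every non-base pointer folds into a
// [base + disp] addressing mode and only the base GEP itself is charged.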
5419
5420InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5421 ScalarEvolution *SE,
5422 const SCEV *Ptr) {
5423 // Address computations in vectorized code with non-consecutive addresses will
5424 // likely result in more instructions compared to scalar code where the
5425 // computation can more often be merged into the index mode. The resulting
5426 // extra micro-ops can significantly decrease throughput.
5427 const unsigned NumVectorInstToHideOverhead = 10;
5428
5429 // Cost modeling of Strided Access Computation is hidden by the indexing
5430 // modes of X86 regardless of the stride value. We don't believe that there
5431 // is a difference between constant strided access in general and a constant
5432 // stride value which is less than or equal to 64.
5433 // Even in the case of (loop invariant) stride whose value is not known at
5434 // compile time, the address computation will not incur more than one extra
5435 // ADD instruction.
5436 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5437 // TODO: AVX2 is the current cut-off because we don't have correct
5438 // interleaving costs for prior ISA's.
5439 if (!BaseT::isStridedAccess(Ptr))
5440 return NumVectorInstToHideOverhead;
5441 if (!BaseT::getConstantStrideStep(SE, Ptr))
5442 return 1;
5443 }
5444
5445 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5446}
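// Summary of the branches above (illustrative): on a pre-AVX2 subtarget a
// vector access with no identifiable stride is charged 10
// (NumVectorInstToHideOverhead) to discourage vectorization, a strided
// access whose step is only known at runtime is charged 1 (one extra ADD),
// and a constant-stride access falls through to the base implementation.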
5447
5448InstructionCost
5449X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5450 std::optional<FastMathFlags> FMF,
5451 TTI::TargetCostKind CostKind) {
5452 if (TTI::requiresOrderedReduction(FMF))
5453 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5454
5455 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5456 // and use that as the cost.
5457
5458 static const CostTblEntry SLMCostTbl[] = {
5459 { ISD::FADD, MVT::v2f64, 3 },
5460 { ISD::ADD, MVT::v2i64, 5 },
5461 };
5462
5463 static const CostTblEntry SSE2CostTbl[] = {
5464 { ISD::FADD, MVT::v2f64, 2 },
5465 { ISD::FADD, MVT::v2f32, 2 },
5466 { ISD::FADD, MVT::v4f32, 4 },
5467 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5468 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5469 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5470 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5471 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5472 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5473 { ISD::ADD, MVT::v2i8, 2 },
5474 { ISD::ADD, MVT::v4i8, 2 },
5475 { ISD::ADD, MVT::v8i8, 2 },
5476 { ISD::ADD, MVT::v16i8, 3 },
5477 };
5478
5479 static const CostTblEntry AVX1CostTbl[] = {
5480 { ISD::FADD, MVT::v4f64, 3 },
5481 { ISD::FADD, MVT::v4f32, 3 },
5482 { ISD::FADD, MVT::v8f32, 4 },
5483 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5484 { ISD::ADD, MVT::v4i64, 3 },
5485 { ISD::ADD, MVT::v8i32, 5 },
5486 { ISD::ADD, MVT::v16i16, 5 },
5487 { ISD::ADD, MVT::v32i8, 4 },
5488 };
5489
5490 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5491 assert(ISD && "Invalid opcode");
5492
5493 // Before legalizing the type, give a chance to look up illegal narrow types
5494 // in the table.
5495 // FIXME: Is there a better way to do this?
5496 EVT VT = TLI->getValueType(DL, ValTy);
5497 if (VT.isSimple()) {
5498 MVT MTy = VT.getSimpleVT();
5499 if (ST->useSLMArithCosts())
5500 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5501 return Entry->Cost;
5502
5503 if (ST->hasAVX())
5504 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5505 return Entry->Cost;
5506
5507 if (ST->hasSSE2())
5508 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5509 return Entry->Cost;
5510 }
5511
5512 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5513
5514 MVT MTy = LT.second;
5515
5516 auto *ValVTy = cast<FixedVectorType>(ValTy);
5517
5518 // Special case: vXi8 mul reductions are performed as vXi16.
5519 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5520 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5521 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5522 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5523 TTI::CastContextHint::None,
5524 CostKind) +
5525 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5526 }
5527
5528 InstructionCost ArithmeticCost = 0;
5529 if (LT.first != 1 && MTy.isVector() &&
5530 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5531 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5532 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5533 MTy.getVectorNumElements());
5534 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5535 ArithmeticCost *= LT.first - 1;
5536 }
5537
5538 if (ST->useSLMArithCosts())
5539 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5540 return ArithmeticCost + Entry->Cost;
5541
5542 if (ST->hasAVX())
5543 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5544 return ArithmeticCost + Entry->Cost;
5545
5546 if (ST->hasSSE2())
5547 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5548 return ArithmeticCost + Entry->Cost;
5549
5550 // FIXME: These assume a naive kshift+binop lowering, which is probably
5551 // conservative in most cases.
5552 static const CostTblEntry AVX512BoolReduction[] = {
5553 { ISD::AND, MVT::v2i1, 3 },
5554 { ISD::AND, MVT::v4i1, 5 },
5555 { ISD::AND, MVT::v8i1, 7 },
5556 { ISD::AND, MVT::v16i1, 9 },
5557 { ISD::AND, MVT::v32i1, 11 },
5558 { ISD::AND, MVT::v64i1, 13 },
5559 { ISD::OR, MVT::v2i1, 3 },
5560 { ISD::OR, MVT::v4i1, 5 },
5561 { ISD::OR, MVT::v8i1, 7 },
5562 { ISD::OR, MVT::v16i1, 9 },
5563 { ISD::OR, MVT::v32i1, 11 },
5564 { ISD::OR, MVT::v64i1, 13 },
5565 };
5566
5567 static const CostTblEntry AVX2BoolReduction[] = {
5568 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5569 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5570 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5571 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5572 };
5573
5574 static const CostTblEntry AVX1BoolReduction[] = {
5575 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5576 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5577 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5578 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5579 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5580 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5581 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5582 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5583 };
5584
5585 static const CostTblEntry SSE2BoolReduction[] = {
5586 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5587 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5588 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5589 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5590 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5591 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5592 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5593 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5594 };
5595
5596 // Handle bool allof/anyof patterns.
5597 if (ValVTy->getElementType()->isIntegerTy(1)) {
5598 InstructionCost ArithmeticCost = 0;
5599 if (LT.first != 1 && MTy.isVector() &&
5600 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5601 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5602 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5603 MTy.getVectorNumElements());
5604 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5605 ArithmeticCost *= LT.first - 1;
5606 }
5607
5608 if (ST->hasAVX512())
5609 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5610 return ArithmeticCost + Entry->Cost;
5611 if (ST->hasAVX2())
5612 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5613 return ArithmeticCost + Entry->Cost;
5614 if (ST->hasAVX())
5615 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5616 return ArithmeticCost + Entry->Cost;
5617 if (ST->hasSSE2())
5618 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5619 return ArithmeticCost + Entry->Cost;
5620
5621 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5622 }
5623
5624 unsigned NumVecElts = ValVTy->getNumElements();
5625 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5626
5627 // Special case power of 2 reductions where the scalar type isn't changed
5628 // by type legalization.
5629 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5630 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5631
5632 InstructionCost ReductionCost = 0;
5633
5634 auto *Ty = ValVTy;
5635 if (LT.first != 1 && MTy.isVector() &&
5636 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5637 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5638 Ty = FixedVectorType::get(ValVTy->getElementType(),
5639 MTy.getVectorNumElements());
5640 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5641 ReductionCost *= LT.first - 1;
5642 NumVecElts = MTy.getVectorNumElements();
5643 }
5644
5645 // Now handle reduction with the legal type, taking into account size changes
5646 // at each level.
5647 while (NumVecElts > 1) {
5648 // Determine the size of the remaining vector we need to reduce.
5649 unsigned Size = NumVecElts * ScalarSize;
5650 NumVecElts /= 2;
5651 // If we're reducing from 256/512 bits, use an extract_subvector.
5652 if (Size > 128) {
5653 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5654 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5655 CostKind, NumVecElts, SubTy);
5656 Ty = SubTy;
5657 } else if (Size == 128) {
5658 // Reducing from 128 bits is a permute of v2f64/v2i64.
5659 FixedVectorType *ShufTy;
5660 if (ValVTy->isFloatingPointTy())
5661 ShufTy =
5662 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5663 else
5664 ShufTy =
5665 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5666 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5667 CostKind, 0, nullptr);
5668 } else if (Size == 64) {
5669 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5670 FixedVectorType *ShufTy;
5671 if (ValVTy->isFloatingPointTy())
5672 ShufTy =
5673 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5674 else
5675 ShufTy =
5676 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5677 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5678 CostKind, 0, nullptr);
5679 } else {
5680 // Reducing from smaller size is a shift by immediate.
5681 auto *ShiftTy = FixedVectorType::get(
5682 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5683 ReductionCost += getArithmeticInstrCost(
5684 Instruction::LShr, ShiftTy, CostKind,
5685 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5686 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5687 }
5688
5689 // Add the arithmetic op for this level.
5690 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5691 }
5692
5693 // Add the final extract element to the cost.
5694 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5695 CostKind, 0, nullptr, nullptr);
5696}
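// Worked example (illustrative): @llvm.vector.reduce.add.v8i32 on AVX2.
//
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
//
// v8i32 is legal, so the loop halves 8 -> 4 -> 2 -> 1: a 256->128-bit
// extract_subvector, then two 128-bit shuffles, with a vpaddd after each of
// the three steps, plus the final movd extract of element 0.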
5697
5698InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5699 TTI::TargetCostKind CostKind,
5700 FastMathFlags FMF) {
5701 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5702 return getIntrinsicInstrCost(ICA, CostKind);
5703}
5704
5705InstructionCost
5706X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5707 FastMathFlags FMF,
5708 TTI::TargetCostKind CostKind) {
5709 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5710
5711 MVT MTy = LT.second;
5712
5713 int ISD;
5714 if (ValTy->isIntOrIntVectorTy()) {
5715 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5716 : ISD::SMIN;
5717 } else {
5718 assert(ValTy->isFPOrFPVectorTy() &&
5719 "Expected float point or integer vector type.");
5720 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5721 ? ISD::FMINNUM
5722 : ISD::FMINIMUM;
5723 }
5724
5725 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
5726 // and use that as the cost.
5727
5728 static const CostTblEntry SSE2CostTbl[] = {
5729 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5730 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5731 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5732 };
5733
5734 static const CostTblEntry SSE41CostTbl[] = {
5735 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5736 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5737 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5738 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5739 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5740 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5741 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5742 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5743 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5744 {ISD::SMIN, MVT::v16i8, 6},
5745 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5746 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5747 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5748 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5749 };
5750
5751 static const CostTblEntry AVX1CostTbl[] = {
5752 {ISD::SMIN, MVT::v16i16, 6},
5753 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5754 {ISD::SMIN, MVT::v32i8, 8},
5755 {ISD::UMIN, MVT::v32i8, 8},
5756 };
5757
5758 static const CostTblEntry AVX512BWCostTbl[] = {
5759 {ISD::SMIN, MVT::v32i16, 8},
5760 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5761 {ISD::SMIN, MVT::v64i8, 10},
5762 {ISD::UMIN, MVT::v64i8, 10},
5763 };
5764
5765 // Before legalizing the type, give a chance to look up illegal narrow types
5766 // in the table.
5767 // FIXME: Is there a better way to do this?
5768 EVT VT = TLI->getValueType(DL, ValTy);
5769 if (VT.isSimple()) {
5770 MVT MTy = VT.getSimpleVT();
5771 if (ST->hasBWI())
5772 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5773 return Entry->Cost;
5774
5775 if (ST->hasAVX())
5776 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5777 return Entry->Cost;
5778
5779 if (ST->hasSSE41())
5780 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5781 return Entry->Cost;
5782
5783 if (ST->hasSSE2())
5784 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5785 return Entry->Cost;
5786 }
5787
5788 auto *ValVTy = cast<FixedVectorType>(ValTy);
5789 unsigned NumVecElts = ValVTy->getNumElements();
5790
5791 auto *Ty = ValVTy;
5792 InstructionCost MinMaxCost = 0;
5793 if (LT.first != 1 && MTy.isVector() &&
5794 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5795 // Type needs to be split. We need LT.first - 1 operations.
5796 Ty = FixedVectorType::get(ValVTy->getElementType(),
5797 MTy.getVectorNumElements());
5798 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5799 MinMaxCost *= LT.first - 1;
5800 NumVecElts = MTy.getVectorNumElements();
5801 }
5802
5803 if (ST->hasBWI())
5804 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5805 return MinMaxCost + Entry->Cost;
5806
5807 if (ST->hasAVX())
5808 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5809 return MinMaxCost + Entry->Cost;
5810
5811 if (ST->hasSSE41())
5812 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5813 return MinMaxCost + Entry->Cost;
5814
5815 if (ST->hasSSE2())
5816 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5817 return MinMaxCost + Entry->Cost;
5818
5819 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5820
5821 // Special case power of 2 reductions where the scalar type isn't changed
5822 // by type legalization.
5823 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5824 ScalarSize != MTy.getScalarSizeInBits())
5825 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5826
5827 // Now handle reduction with the legal type, taking into account size changes
5828 // at each level.
5829 while (NumVecElts > 1) {
5830 // Determine the size of the remaining vector we need to reduce.
5831 unsigned Size = NumVecElts * ScalarSize;
5832 NumVecElts /= 2;
5833 // If we're reducing from 256/512 bits, use an extract_subvector.
5834 if (Size > 128) {
5835 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5836 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5837 NumVecElts, SubTy);
5838 Ty = SubTy;
5839 } else if (Size == 128) {
5840 // Reducing from 128 bits is a permute of v2f64/v2i64.
5841 VectorType *ShufTy;
5842 if (ValTy->isFloatingPointTy())
5843 ShufTy =
5844 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5845 else
5846 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5847 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5848 CostKind, 0, nullptr);
5849 } else if (Size == 64) {
5850 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5851 FixedVectorType *ShufTy;
5852 if (ValTy->isFloatingPointTy())
5853 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5854 else
5855 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5856 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5857 CostKind, 0, nullptr);
5858 } else {
5859 // Reducing from smaller size is a shift by immediate.
5860 auto *ShiftTy = FixedVectorType::get(
5861 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5862 MinMaxCost += getArithmeticInstrCost(
5863 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5864 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5865 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5866 }
5867
5868 // Add the arithmetic op for this level.
5869 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5870 }
5871
5872 // Add the final extract element to the cost.
5873 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5874 CostKind, 0, nullptr, nullptr);
5875}
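// Illustrative: @llvm.vector.reduce.smax.v4i32 with SSE4.1 follows the same
// ladder as the arithmetic reductions above: a shuffle + pmaxsd step at 128
// bits, another shuffle + pmaxsd step at 64 bits, then a movd extract of
// element 0.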
5876
5877/// Calculate the cost of materializing a 64-bit value. This helper
5878/// method might only calculate a fraction of a larger immediate. Therefore it
5879/// is valid to return a cost of ZERO.
5880InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5881 if (Val == 0)
5882 return TTI::TCC_Free;
5883
5884 if (isInt<32>(Val))
5885 return TTI::TCC_Basic;
5886
5887 return 2 * TTI::TCC_Basic;
5888}
5889
5890InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5891 TTI::TargetCostKind CostKind) {
5892 assert(Ty->isIntegerTy());
5893
5894 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5895 if (BitSize == 0)
5896 return ~0U;
5897
5898 // Never hoist constants larger than 128 bits, because this might lead to
5899 // incorrect code generation or assertions in codegen.
5900 // FIXME: Create a cost model for types larger than i128 once the codegen
5901 // issues have been fixed.
5902 if (BitSize > 128)
5903 return TTI::TCC_Free;
5904
5905 if (Imm == 0)
5906 return TTI::TCC_Free;
5907
5908 // Sign-extend all constants to a multiple of 64-bit.
5909 APInt ImmVal = Imm;
5910 if (BitSize % 64 != 0)
5911 ImmVal = Imm.sext(alignTo(BitSize, 64));
5912
5913 // Split the constant into 64-bit chunks and calculate the cost for each
5914 // chunk.
5915 InstructionCost Cost = 0;
5916 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5917 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5918 int64_t Val = Tmp.getSExtValue();
5919 Cost += getIntImmCost(Val);
5920 }
5921 // We need at least one instruction to materialize the constant.
5922 return std::max<InstructionCost>(1, Cost);
5923}
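// Worked example: materializing an i128 constant equal to 1 << 100. The
// value is sign-extended and split into two 64-bit chunks: the low chunk is
// 0 (TCC_Free) and the high chunk (1 << 36) doesn't fit in an i32, so it
// costs 2 * TCC_Basic (a movabsq-style materialization), for a total of 2.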
5924
5925InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5926 const APInt &Imm, Type *Ty,
5927 TTI::TargetCostKind CostKind,
5928 Instruction *Inst) {
5929 assert(Ty->isIntegerTy());
5930
5931 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5932 unsigned ImmBitWidth = Imm.getBitWidth();
5933
5934 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5935 // here, so that constant hoisting will ignore this constant.
5936 if (BitSize == 0)
5937 return TTI::TCC_Free;
5938
5939 unsigned ImmIdx = ~0U;
5940 switch (Opcode) {
5941 default:
5942 return TTI::TCC_Free;
5943 case Instruction::GetElementPtr:
5944 // Always hoist the base address of a GetElementPtr. This prevents the
5945 // creation of new constants for every base constant that gets constant
5946 // folded with the offset.
5947 if (Idx == 0)
5948 return 2 * TTI::TCC_Basic;
5949 return TTI::TCC_Free;
5950 case Instruction::Store:
5951 ImmIdx = 0;
5952 break;
5953 case Instruction::ICmp:
5954 // This is an imperfect hack to prevent constant hoisting of
5955 // compares that might be trying to check if a 64-bit value fits in
5956 // 32-bits. The backend can optimize these cases using a right shift by 32.
5957 // Ideally we would check the compare predicate here. There are also other
5958 // similar immediates the backend can use shifts for.
5959 if (Idx == 1 && ImmBitWidth == 64) {
5960 uint64_t ImmVal = Imm.getZExtValue();
5961 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5962 return TTI::TCC_Free;
5963 }
5964 ImmIdx = 1;
5965 break;
5966 case Instruction::And:
5967 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5968 // by using a 32-bit operation with implicit zero extension. Detect such
5969 // immediates here as the normal path expects bit 31 to be sign extended.
5970 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5971 return TTI::TCC_Free;
5972 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
5973 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5974 Imm.isMask())
5975 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5976 ImmIdx = 1;
5977 break;
5978 case Instruction::Add:
5979 case Instruction::Sub:
5980 // For add/sub, we can use the opposite instruction for INT32_MIN.
5981 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
5982 return TTI::TCC_Free;
5983 ImmIdx = 1;
5984 break;
5985 case Instruction::UDiv:
5986 case Instruction::SDiv:
5987 case Instruction::URem:
5988 case Instruction::SRem:
5989 // Division by constant is typically expanded later into a different
5990 // instruction sequence. This completely changes the constants.
5991 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5992 return TTI::TCC_Free;
5993 case Instruction::Mul:
5994 case Instruction::Or:
5995 case Instruction::Xor:
5996 ImmIdx = 1;
5997 break;
5998 // Always return TCC_Free for the shift value of a shift instruction.
5999 case Instruction::Shl:
6000 case Instruction::LShr:
6001 case Instruction::AShr:
6002 if (Idx == 1)
6003 return TTI::TCC_Free;
6004 break;
6005 case Instruction::Trunc:
6006 case Instruction::ZExt:
6007 case Instruction::SExt:
6008 case Instruction::IntToPtr:
6009 case Instruction::PtrToInt:
6010 case Instruction::BitCast:
6011 case Instruction::PHI:
6012 case Instruction::Call:
6013 case Instruction::Select:
6014 case Instruction::Ret:
6015 case Instruction::Load:
6016 break;
6017 }
6018
6019 if (Idx == ImmIdx) {
6020 uint64_t NumConstants = divideCeil(BitSize, 64);
6021 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6022 return (Cost <= NumConstants * TTI::TCC_Basic)
6023 ? static_cast<int>(TTI::TCC_Free)
6024 : Cost;
6025 }
6026
6027 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6028}
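// Worked example (illustrative): for `and i64 %x, 4294967295` the immediate
// fits in 32 bits, so the Instruction::And case above reports it TCC_Free
// (the backend can use a 32-bit AND with implicit zero extension). A wider
// mask such as `and i64 %x, 137438953471` (a 37-bit mask) is charged like a
// small immediate when BMI's BEXTR/BZHI applies, and as a movabsq-
// materialized 64-bit constant otherwise.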
6029
6030InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6031 const APInt &Imm, Type *Ty,
6032 TTI::TargetCostKind CostKind) {
6033 assert(Ty->isIntegerTy());
6034
6035 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6036 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6037 // here, so that constant hoisting will ignore this constant.
6038 if (BitSize == 0)
6039 return TTI::TCC_Free;
6040
6041 switch (IID) {
6042 default:
6043 return TTI::TCC_Free;
6044 case Intrinsic::sadd_with_overflow:
6045 case Intrinsic::uadd_with_overflow:
6046 case Intrinsic::ssub_with_overflow:
6047 case Intrinsic::usub_with_overflow:
6048 case Intrinsic::smul_with_overflow:
6049 case Intrinsic::umul_with_overflow:
6050 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6051 return TTI::TCC_Free;
6052 break;
6053 case Intrinsic::experimental_stackmap:
6054 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6055 return TTI::TCC_Free;
6056 break;
6057 case Intrinsic::experimental_patchpoint_void:
6058 case Intrinsic::experimental_patchpoint:
6059 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6060 return TTI::TCC_Free;
6061 break;
6062 }
6063 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6064}
6065
6066 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6067                                            TTI::TargetCostKind CostKind,
6068                                            const Instruction *I) {
6069   if (CostKind != TTI::TCK_RecipThroughput)
6070     return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6071 // Branches are assumed to be predicted.
6072 return TTI::TCC_Free;
6073}
6074
6075int X86TTIImpl::getGatherOverhead() const {
6076 // Some CPUs have more overhead for gather. The specified overhead is relative
6077 // to the Load operation. "2" is the number provided by Intel architects. This
6078 // parameter is used for cost estimation of Gather Op and comparison with
6079 // other alternatives.
6080   // TODO: Remove the explicit hasAVX512()? That would mean we would only
6081 // enable gather with a -march.
6082 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6083 return 2;
6084
6085 return 1024;
6086}
6087
6088int X86TTIImpl::getScatterOverhead() const {
6089 if (ST->hasAVX512())
6090 return 2;
6091
6092 return 1024;
6093}
6094
6095 // Return the average cost of a Gather / Scatter instruction; may be improved later.
6096InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6097                                             TTI::TargetCostKind CostKind,
6098                                             Type *SrcVTy, const Value *Ptr,
6099 Align Alignment,
6100 unsigned AddressSpace) {
6101
6102 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6103 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6104
6105 // Try to reduce index size from 64 bit (default for GEP)
6106 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6107   // operation will use 16 x 64 indices, which do not fit in a zmm register and
6108   // need to be split. Also check that the base pointer is the same for all lanes,
6109 // and that there's at most one variable index.
6110 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6111 unsigned IndexSize = DL.getPointerSizeInBits();
6112 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6113 if (IndexSize < 64 || !GEP)
6114 return IndexSize;
6115
6116 unsigned NumOfVarIndices = 0;
6117 const Value *Ptrs = GEP->getPointerOperand();
6118 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6119 return IndexSize;
6120 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6121 if (isa<Constant>(GEP->getOperand(I)))
6122 continue;
6123 Type *IndxTy = GEP->getOperand(I)->getType();
6124 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6125 IndxTy = IndexVTy->getElementType();
6126 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6127 !isa<SExtInst>(GEP->getOperand(I))) ||
6128 ++NumOfVarIndices > 1)
6129 return IndexSize; // 64
6130 }
6131 return (unsigned)32;
6132 };
6133
6134   // Try to reduce IndexSize to 32 bits for a 16-element vector.
6135   // By default the IndexSize is equal to the pointer size.
6136 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6137 ? getIndexSizeInBits(Ptr, DL)
6138                            : DL.getPointerSizeInBits();
6139
6140 auto *IndexVTy = FixedVectorType::get(
6141 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6142 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6143 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6144 InstructionCost::CostType SplitFactor =
6145 *std::max(IdxsLT.first, SrcLT.first).getValue();
6146 if (SplitFactor > 1) {
6147 // Handle splitting of vector of pointers
6148 auto *SplitSrcTy =
6149 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6150 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6151 Alignment, AddressSpace);
6152 }
6153
6154 // If we didn't split, this will be a single gather/scatter instruction.
6155   if (CostKind != TTI::TCK_RecipThroughput)
6156     return 1;
6157
6158 // The gather / scatter cost is given by Intel architects. It is a rough
6159   // number since we are looking at one instruction at a time.
6160 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6161 : getScatterOverhead();
6162 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6163 MaybeAlign(Alignment), AddressSpace,
6164 CostKind);
6165}
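
// Worked example (illustrative sketch, not part of this file): the unsplit
// gather formula above, evaluated with hypothetical unit costs for a v16f32
// gather on an AVX-512 target (overhead 2, scalar f32 load cost 1).
static int gatherCostSketch() {
  const int GSOverhead = 2;     // getGatherOverhead() with AVX512
  const int VF = 16;            // lanes in v16f32
  const int ScalarLoadCost = 1; // assumed getMemoryOpCost(f32) result
  return GSOverhead + VF * ScalarLoadCost; // == 18
}
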
6166
6167/// Calculate the cost of Gather / Scatter operation
6168 InstructionCost X86TTIImpl::getGatherScatterOpCost(
6169     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6170     Align Alignment, TTI::TargetCostKind CostKind,
6171     const Instruction *I = nullptr) {
6172 if ((Opcode == Instruction::Load &&
6173 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6174 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6175 Align(Alignment)))) ||
6176 (Opcode == Instruction::Store &&
6177 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6178 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6179 Align(Alignment)))))
6180 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6181 Alignment, CostKind, I);
6182
6183 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6184 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6185 if (!PtrTy && Ptr->getType()->isVectorTy())
6186 PtrTy = dyn_cast<PointerType>(
6187 cast<VectorType>(Ptr->getType())->getElementType());
6188 assert(PtrTy && "Unexpected type for Ptr argument");
6189 unsigned AddressSpace = PtrTy->getAddressSpace();
6190 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6191 AddressSpace);
6192}
6193
6194 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6195                                const TargetTransformInfo::LSRCost &C2) {
6196   // X86-specific here: the instruction count (Insns) gets first priority.
6197 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6198 C1.NumIVMuls, C1.NumBaseAdds,
6199 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6200 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6201 C2.NumIVMuls, C2.NumBaseAdds,
6202 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6203}
6204
6205 bool X86TTIImpl::canMacroFuseCmp() {
6206   return ST->hasMacroFusion() || ST->hasBranchFusion();
6207}
6208
6209bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6210 Type *ScalarTy = DataTy->getScalarType();
6211
6212 // The backend can't handle a single element vector w/o CFCMOV.
6213 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6214 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6215
6216 if (!ST->hasAVX())
6217 return false;
6218
6219 if (ScalarTy->isPointerTy())
6220 return true;
6221
6222 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6223 return true;
6224
6225 if (ScalarTy->isHalfTy() && ST->hasBWI())
6226 return true;
6227
6228 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6229 return true;
6230
6231 if (!ScalarTy->isIntegerTy())
6232 return false;
6233
6234 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6235 return IntWidth == 32 || IntWidth == 64 ||
6236 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6237}
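
// Condensed sketch (not part of this file) of the integer-width rule above,
// with the subtarget checks abstracted into booleans for illustration.
static bool maskedLoadIntLegalSketch(unsigned IntWidth, bool HasAVX,
                                     bool HasBWI) {
  if (!HasAVX)
    return false;
  return IntWidth == 32 || IntWidth == 64 ||
         ((IntWidth == 8 || IntWidth == 16) && HasBWI);
}
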
6238
6239bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6240 return isLegalMaskedLoad(DataType, Alignment);
6241}
6242
6243bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6244 unsigned DataSize = DL.getTypeStoreSize(DataType);
6245 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6246 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6247 // (the equivalent stores only require AVX).
6248 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6249 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6250
6251 return false;
6252}
6253
6254bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6255 unsigned DataSize = DL.getTypeStoreSize(DataType);
6256
6257 // SSE4A supports nontemporal stores of float and double at arbitrary
6258 // alignment.
6259 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6260 return true;
6261
6262 // Besides the SSE4A subtarget exception above, only aligned stores are
6263   // available nontemporally on any other subtarget. And only stores with a size
6264 // of 4..32 bytes (powers of 2, only) are permitted.
6265 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6266 !isPowerOf2_32(DataSize))
6267 return false;
6268
6269 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6270 // loads require AVX2).
6271 if (DataSize == 32)
6272 return ST->hasAVX();
6273 if (DataSize == 16)
6274 return ST->hasSSE1();
6275 return true;
6276}
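
// Condensed sketch (not part of this file) of the non-SSE4A rules above:
// aligned, power-of-two sizes of 4..32 bytes, with 32-byte stores gated on
// AVX and 16-byte stores on SSE1.
static bool ntStoreLegalSketch(unsigned DataSize, unsigned Alignment,
                               bool HasAVX, bool HasSSE1) {
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
      (DataSize & (DataSize - 1)) != 0) // not a power of two
    return false;
  if (DataSize == 32)
    return HasAVX;
  if (DataSize == 16)
    return HasSSE1;
  return true; // 4 or 8 bytes
}
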
6277
6278 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6279                                       ElementCount NumElements) const {
6280 // movddup
6281 return ST->hasSSE3() && !NumElements.isScalable() &&
6282 NumElements.getFixedValue() == 2 &&
6283 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6284}
6285
6286 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6287   if (!isa<VectorType>(DataTy))
6288 return false;
6289
6290 if (!ST->hasAVX512())
6291 return false;
6292
6293 // The backend can't handle a single element vector.
6294 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6295 return false;
6296
6297 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6298
6299 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6300 return true;
6301
6302 if (!ScalarTy->isIntegerTy())
6303 return false;
6304
6305 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6306 return IntWidth == 32 || IntWidth == 64 ||
6307 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6308}
6309
6310 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6311   return isLegalMaskedExpandLoad(DataTy, Alignment);
6312}
6313
6314bool X86TTIImpl::supportsGather() const {
6315 // Some CPUs have better gather performance than others.
6316   // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6317 // enable gather with a -march.
6318 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6319}
6320
6321 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6322   // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6323   // A vector-4 gather/scatter instruction does not exist on KNL. We can extend
6324   // it to 8 elements, but zeroing the upper bits of the mask vector will add
6325   // more instructions. Right now we give the scalar cost of vector-4 for KNL.
6326   // TODO: Check whether the gather/scatter instruction is better in the
6327   // VariableMask case.
6328 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6329 return NumElts == 1 ||
6330 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6331}
6332
6333 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6334   Type *ScalarTy = DataTy->getScalarType();
6335 if (ScalarTy->isPointerTy())
6336 return true;
6337
6338 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6339 return true;
6340
6341 if (!ScalarTy->isIntegerTy())
6342 return false;
6343
6344 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6345 return IntWidth == 32 || IntWidth == 64;
6346}
6347
6348 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6349   if (!supportsGather() || !ST->preferGather())
6350 return false;
6351 return isLegalMaskedGatherScatter(DataTy, Alignment);
6352}
6353
6354bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6355 unsigned Opcode1,
6356 const SmallBitVector &OpcodeMask) const {
6357 // ADDSUBPS 4xf32 SSE3
6358 // VADDSUBPS 4xf32 AVX
6359 // VADDSUBPS 8xf32 AVX2
6360 // ADDSUBPD 2xf64 SSE3
6361 // VADDSUBPD 2xf64 AVX
6362 // VADDSUBPD 4xf64 AVX2
6363
6364 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6365 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6366 if (!isPowerOf2_32(NumElements))
6367 return false;
6368 // Check the opcode pattern. We apply the mask on the opcode arguments and
6369 // then check if it is what we expect.
6370 for (int Lane : seq<int>(0, NumElements)) {
6371 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6372 // We expect FSub for even lanes and FAdd for odd lanes.
6373 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6374 return false;
6375 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6376 return false;
6377 }
6378 // Now check that the pattern is supported by the target ISA.
6379 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6380 if (ElemTy->isFloatTy())
6381 return ST->hasSSE3() && NumElements % 4 == 0;
6382 if (ElemTy->isDoubleTy())
6383 return ST->hasSSE3() && NumElements % 2 == 0;
6384 return false;
6385}
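
// Example (illustrative sketch, not part of this file): the OpcodeMask shape
// the check above accepts for 4 x float. Lane i uses Opcode1 when its bit is
// set; sub in even lanes plus add in odd lanes matches (V)ADDSUBPS.
#include "llvm/ADT/SmallBitVector.h"

static llvm::SmallBitVector addsubMaskSketch() {
  llvm::SmallBitVector OpcodeMask(4, false); // all lanes Opcode0 (FSub)
  OpcodeMask.set(1);
  OpcodeMask.set(3);  // odd lanes use Opcode1 (FAdd)
  return OpcodeMask;  // sub,add,sub,add -> legal with SSE3
}
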
6386
6387bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6388 // AVX2 doesn't support scatter
6389 if (!ST->hasAVX512() || !ST->preferScatter())
6390 return false;
6391 return isLegalMaskedGatherScatter(DataType, Alignment);
6392}
6393
6394bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6395 EVT VT = TLI->getValueType(DL, DataType);
6396 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6397}
6398
6399 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6400   // FDIV is always expensive, even if it has a very low uop count.
6401 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6402 if (I->getOpcode() == Instruction::FDiv)
6403 return true;
6404
6405   return BaseT::isExpensiveToSpeculativelyExecute(I);
6406 }
6407
6408 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6409   return false;
6410}
6411
6412 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6413                                      const Function *Callee) const {
6414 const TargetMachine &TM = getTLI()->getTargetMachine();
6415
6416 // Work this as a subsetting of subtarget features.
6417 const FeatureBitset &CallerBits =
6418 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6419 const FeatureBitset &CalleeBits =
6420 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6421
6422 // Check whether features are the same (apart from the ignore list).
6423 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6424 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6425 if (RealCallerBits == RealCalleeBits)
6426 return true;
6427
6428 // If the features are a subset, we need to additionally check for calls
6429 // that may become ABI-incompatible as a result of inlining.
6430 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6431 return false;
6432
6433 for (const Instruction &I : instructions(Callee)) {
6434 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6435 // Having more target features is fine for inline ASM.
6436 if (CB->isInlineAsm())
6437 continue;
6438
6439       SmallVector<Type *, 8> Types;
6440       for (Value *Arg : CB->args())
6441 Types.push_back(Arg->getType());
6442 if (!CB->getType()->isVoidTy())
6443 Types.push_back(CB->getType());
6444
6445 // Simple types are always ABI compatible.
6446 auto IsSimpleTy = [](Type *Ty) {
6447 return !Ty->isVectorTy() && !Ty->isAggregateType();
6448 };
6449 if (all_of(Types, IsSimpleTy))
6450 continue;
6451
6452 if (Function *NestedCallee = CB->getCalledFunction()) {
6453 // Assume that intrinsics are always ABI compatible.
6454 if (NestedCallee->isIntrinsic())
6455 continue;
6456
6457 // Do a precise compatibility check.
6458 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6459 return false;
6460 } else {
6461 // We don't know the target features of the callee,
6462 // assume it is incompatible.
6463 return false;
6464 }
6465 }
6466 }
6467 return true;
6468}
6469
6470 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6471                                        const Function *Callee,
6472 const ArrayRef<Type *> &Types) const {
6473 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6474 return false;
6475
6476 // If we get here, we know the target features match. If one function
6477 // considers 512-bit vectors legal and the other does not, consider them
6478 // incompatible.
6479 const TargetMachine &TM = getTLI()->getTargetMachine();
6480
6481 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6482 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6483 return true;
6484
6485 // Consider the arguments compatible if they aren't vectors or aggregates.
6486 // FIXME: Look at the size of vectors.
6487 // FIXME: Look at the element types of aggregates to see if there are vectors.
6488 return llvm::none_of(Types,
6489 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6490}
6491
6492 X86TTIImpl::TTI::MemCmpExpansionOptions
6493 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6494   TTI::MemCmpExpansionOptions Options;
6495   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6496 Options.NumLoadsPerBlock = 2;
6497 // All GPR and vector loads can be unaligned.
6498 Options.AllowOverlappingLoads = true;
6499 if (IsZeroCmp) {
6500 // Only enable vector loads for equality comparison. Right now the vector
6501     // version is not as fast for three-way compare (see #33329).
6502 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6503 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6504 Options.LoadSizes.push_back(64);
6505 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6506 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6507 }
6508 if (ST->is64Bit()) {
6509 Options.LoadSizes.push_back(8);
6510 }
6511 Options.LoadSizes.push_back(4);
6512 Options.LoadSizes.push_back(2);
6513 Options.LoadSizes.push_back(1);
6514 return Options;
6515}
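
// Example (illustrative, not part of this file): the load sizes the hook
// above produces for an equality memcmp on a hypothetical 64-bit AVX2 target
// without AVX-512 -- vector widths first, then GPR widths.
#include <vector>

static std::vector<unsigned> memcmpLoadSizesSketch() {
  return {32, 16, 8, 4, 2, 1}; // 32B AVX, 16B SSE2, then 8/4/2/1-byte GPR
}
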
6516
6517 bool X86TTIImpl::prefersVectorizedAddressing() const {
6518   return supportsGather();
6519}
6520
6521 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6522   return false;
6523}
6524
6525 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6526   // TODO: We expect this to be beneficial regardless of arch,
6527 // but there are currently some unexplained performance artifacts on Atom.
6528 // As a temporary solution, disable on Atom.
6529 return !(ST->isAtom());
6530}
6531
6532// Get estimation for interleaved load/store operations and strided load.
6533// \p Indices contains indices for strided load.
6534// \p Factor - the factor of interleaving.
6535 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6536 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6537     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6538 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6539 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6540 // VecTy for interleave memop is <VF*Factor x Elt>.
6541 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6542 // VecTy = <12 x i32>.
6543
6544 // Calculate the number of memory operations (NumOfMemOps), required
6545 // for load/store the VecTy.
6546 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6547 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6548 unsigned LegalVTSize = LegalVT.getStoreSize();
6549 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6550
6551 // Get the cost of one memory operation.
6552 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6553 LegalVT.getVectorNumElements());
6554 InstructionCost MemOpCost;
6555 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6556 if (UseMaskedMemOp)
6557     MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6558                                       AddressSpace, CostKind);
6559   else
6560     MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6561                                 AddressSpace, CostKind);
6562
6563 unsigned VF = VecTy->getNumElements() / Factor;
6564   MVT VT =
6565       MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6566
6567 InstructionCost MaskCost;
6568 if (UseMaskedMemOp) {
6569 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6570 for (unsigned Index : Indices) {
6571 assert(Index < Factor && "Invalid index for interleaved memory op");
6572 for (unsigned Elm = 0; Elm < VF; Elm++)
6573 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6574 }
6575
6576 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6577
6578 MaskCost = getReplicationShuffleCost(
6579 I1Type, Factor, VF,
6580 UseMaskForGaps ? DemandedLoadStoreElts
6581                        : APInt::getAllOnes(VecTy->getNumElements()),
6582         CostKind);
6583
6584 // The Gaps mask is invariant and created outside the loop, therefore the
6585     // cost of creating it is not accounted for here. However, if we have both
6586 // a MaskForGaps and some other mask that guards the execution of the
6587 // memory access, we need to account for the cost of And-ing the two masks
6588 // inside the loop.
6589 if (UseMaskForGaps) {
6590 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6591 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6592 }
6593 }
6594
6595 if (Opcode == Instruction::Load) {
6596 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6597 // contain the cost of the optimized shuffle sequence that the
6598 // X86InterleavedAccess pass will generate.
6599 // The cost of loads and stores are computed separately from the table.
6600
6601 // X86InterleavedAccess support only the following interleaved-access group.
6602 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6603 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6604 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6605 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6606 };
6607
6608 if (const auto *Entry =
6609 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6610 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6611     // If an entry does not exist, fall back to the default implementation.
6612
6613 // Kind of shuffle depends on number of loaded values.
6614 // If we load the entire data in one register, we can use a 1-src shuffle.
6615 // Otherwise, we'll merge 2 sources in each operation.
6616 TTI::ShuffleKind ShuffleKind =
6617 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6618
6619 InstructionCost ShuffleCost =
6620 getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6621
6622 unsigned NumOfLoadsInInterleaveGrp =
6623 Indices.size() ? Indices.size() : Factor;
6624 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6625 VecTy->getNumElements() / Factor);
6626 InstructionCost NumOfResults =
6627 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6628
6629     // About half of the loads may be folded into shuffles when we have only
6630 // one result. If we have more than one result, or the loads are masked,
6631 // we do not fold loads at all.
6632 unsigned NumOfUnfoldedLoads =
6633 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6634
6635 // Get a number of shuffle operations per result.
6636 unsigned NumOfShufflesPerResult =
6637 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6638
6639     // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6640 // When we have more than one destination, we need additional instructions
6641 // to keep sources.
6642 InstructionCost NumOfMoves = 0;
6643 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6644 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6645
6646 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6647 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6648 NumOfMoves;
6649
6650 return Cost;
6651 }
6652
6653 // Store.
6654 assert(Opcode == Instruction::Store &&
6655 "Expected Store Instruction at this point");
6656 // X86InterleavedAccess support only the following interleaved-access group.
6657 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6658 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6659 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6660 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6661
6662 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6663 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6664 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6665 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6666 };
6667
6668 if (const auto *Entry =
6669 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6670 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6671   // If an entry does not exist, fall back to the default implementation.
6672
6673   // There are no strided stores at the moment, and a store can't be folded
6674   // into a shuffle.
6675 unsigned NumOfSources = Factor; // The number of values to be merged.
6676 InstructionCost ShuffleCost = getShuffleCost(
6677 TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6678 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6679
6680   // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6681 // We need additional instructions to keep sources.
6682 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6683   InstructionCost Cost =
6684       MaskCost +
6685 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6686 NumOfMoves;
6687 return Cost;
6688}
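
// Worked example (hypothetical unit costs, not part of this file): a stride-3
// load group deinterleaving into 3 x v16i8 hits the AVX512InterleavedLoadTbl
// entry {3, MVT::v16i8, 12}, so the early-return path above yields:
static int avx512InterleaveLoadSketch() {
  const int EntryCost = 12;  // shuffle-sequence cost from the table
  const int MaskCost = 0;    // unmasked access in this example
  const int NumOfMemOps = 1; // assume the 48-byte load legalizes to one op
  const int MemOpCost = 1;   // assumed unit load cost
  return MaskCost + NumOfMemOps * MemOpCost + EntryCost; // == 13
}
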
6689
6690 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6691     unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6692 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6693 bool UseMaskForCond, bool UseMaskForGaps) {
6694 auto *VecTy = cast<FixedVectorType>(BaseTy);
6695
6696 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6697 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6698 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6699 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6700 return true;
6701 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6702 return ST->hasBWI();
6703 if (EltTy->isBFloatTy())
6704 return ST->hasBF16();
6705 return false;
6706 };
6707 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6708     return getInterleavedMemoryOpCostAVX512(
6709         Opcode, VecTy, Factor, Indices, Alignment,
6710 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6711
6712 if (UseMaskForCond || UseMaskForGaps)
6713 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6714 Alignment, AddressSpace, CostKind,
6715 UseMaskForCond, UseMaskForGaps);
6716
6717 // Get estimation for interleaved load/store operations for SSE-AVX2.
6718   // As opposed to AVX-512, SSE-AVX2 targets do not have generic shuffles that
6719   // allow computing the cost using a generic formula as a function of generic
6720 // shuffles. We therefore use a lookup table instead, filled according to
6721 // the instruction sequences that codegen currently generates.
6722
6723 // VecTy for interleave memop is <VF*Factor x Elt>.
6724 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6725 // VecTy = <12 x i32>.
6726 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6727
6728 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6729 // the VF=2, while v2i128 is an unsupported MVT vector type
6730 // (see MachineValueType.h::getVectorVT()).
6731 if (!LegalVT.isVector())
6732 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6733 Alignment, AddressSpace, CostKind);
6734
6735 unsigned VF = VecTy->getNumElements() / Factor;
6736 Type *ScalarTy = VecTy->getElementType();
6737 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6738 if (!ScalarTy->isIntegerTy())
6739 ScalarTy =
6740 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6741
6742 // Get the cost of all the memory operations.
6743 // FIXME: discount dead loads.
6744 InstructionCost MemOpCosts = getMemoryOpCost(
6745 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6746
6747 auto *VT = FixedVectorType::get(ScalarTy, VF);
6748 EVT ETy = TLI->getValueType(DL, VT);
6749 if (!ETy.isSimple())
6750 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6751 Alignment, AddressSpace, CostKind);
6752
6753 // TODO: Complete for other data-types and strides.
6754 // Each combination of Stride, element bit width and VF results in a different
6755   // sequence; the cost tables are therefore accessed with:
6756   // Factor (stride) and VectorType=VFxiN.
6757   // The Cost accounts only for the shuffle sequence;
6758   // the cost of the loads/stores is accounted for separately.
6759 //
6760 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6761 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6762 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6763 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6764 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6765 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6766
6767 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6768 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6769 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6770
6771 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6772 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6773 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6774
6775 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6776 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6777 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6778 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6779
6780 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6781 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6782 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6783 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6784 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6785
6786 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6787 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6788 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6789 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6790 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6791
6792 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6793 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6794 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6795 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6796 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6797
6798 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6799 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6800 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6801 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6802
6803 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6804 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6805 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6806 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6807 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6808
6809 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6810 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6811 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6812 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6813 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6814
6815 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6816 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6817 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6818 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6819 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6820
6821 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6822 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6823 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6824 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6825
6826 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6827 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6828 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6829 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6830 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6831
6832 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6833 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6834 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6835 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6836 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6837
6838 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6839 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6840 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6841 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6842
6843 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6844 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6845 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6846
6847 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6848 };
6849
6850 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6851 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6852 };
6853
6854 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6855 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6856 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6857
6858 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6859 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6860
6861 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6862 };
6863
6864 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6865 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6866 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6867
6868 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6869 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6870 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6871
6872 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6873 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6874 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6875 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6876
6877 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6878 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6879 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6880 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6881 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6882
6883 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6884 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6885 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6886 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6887 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6888
6889 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6890 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6891 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6892 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6893 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6894
6895 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6896 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6897 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6898 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6899 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6900
6901 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6902 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6903 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6904 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6905
6906 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6907 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6908 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6909 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6910 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6911
6912 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6913 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6914 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6915 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6916 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6917
6918 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6919 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6920 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6921 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6922 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6923
6924 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6925 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6926 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6927 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6928
6929 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6930 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6931 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6932 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6933 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6934
6935 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6936 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6937 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6938 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6939 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6940
6941 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6942 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6943 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6944 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6945
6946 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6947 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6948 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6949 };
6950
6951 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6952 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6953 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6954 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6955
6956 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6957 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6958
6959 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6960 };
6961
6962 if (Opcode == Instruction::Load) {
6963 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6964 MemOpCosts](const CostTblEntry *Entry) {
6965 // NOTE: this is just an approximation!
6966       // It can over- or under-estimate the cost!
6967 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6968 };
6969
6970 if (ST->hasAVX2())
6971 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6972 ETy.getSimpleVT()))
6973 return GetDiscountedCost(Entry);
6974
6975 if (ST->hasSSSE3())
6976 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6977 ETy.getSimpleVT()))
6978 return GetDiscountedCost(Entry);
6979
6980 if (ST->hasSSE2())
6981 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6982 ETy.getSimpleVT()))
6983 return GetDiscountedCost(Entry);
6984 } else {
6985 assert(Opcode == Instruction::Store &&
6986 "Expected Store Instruction at this point");
6987 assert((!Indices.size() || Indices.size() == Factor) &&
6988 "Interleaved store only supports fully-interleaved groups.");
6989 if (ST->hasAVX2())
6990 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6991 ETy.getSimpleVT()))
6992 return MemOpCosts + Entry->Cost;
6993
6994 if (ST->hasSSE2())
6995 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6996 ETy.getSimpleVT()))
6997 return MemOpCosts + Entry->Cost;
6998 }
6999
7000 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7001 Alignment, AddressSpace, CostKind,
7002 UseMaskForCond, UseMaskForGaps);
7003}
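
// Worked example (hypothetical numbers, not part of this file): the
// GetDiscountedCost lambda above for a stride-3 v16i8 AVX2 load group where
// only 2 of the 3 members are used, with table entry {3, MVT::v16i8, 11}.
static int avx2DiscountedLoadSketch() {
  const int MemOpCosts = 2; // assumed cost of the wide loads
  const int EntryCost = 11; // AVX2InterleavedLoadTbl shuffle cost
  const int NumMembers = 2; // Indices.size()
  const int Factor = 3;
  // divideCeil(2 * 11, 3) == 8, so the total is 2 + 8 == 10.
  return MemOpCosts + (NumMembers * EntryCost + Factor - 1) / Factor;
}
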
7004
7005 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7006                                                  StackOffset BaseOffset,
7007 bool HasBaseReg, int64_t Scale,
7008 unsigned AddrSpace) const {
7009 // Scaling factors are not free at all.
7010 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7011 // will take 2 allocations in the out of order engine instead of 1
7012 // for plain addressing mode, i.e. inst (reg1).
7013 // E.g.,
7014 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7015 // Requires two allocations (one for the load, one for the computation)
7016 // whereas:
7017 // vaddps (%rsi), %ymm0, %ymm1
7018 // Requires just 1 allocation, i.e., freeing allocations for other operations
7019   // and having fewer micro-operations to execute.
7020 //
7021 // For some X86 architectures, this is even worse because for instance for
7022 // stores, the complex addressing mode forces the instruction to use the
7023 // "load" ports instead of the dedicated "store" port.
7024 // E.g., on Haswell:
7025 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7026 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7027   TargetLoweringBase::AddrMode AM;
7028   AM.BaseGV = BaseGV;
7029 AM.BaseOffs = BaseOffset.getFixed();
7030 AM.HasBaseReg = HasBaseReg;
7031 AM.Scale = Scale;
7032 AM.ScalableOffset = BaseOffset.getScalable();
7033 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7034 // Scale represents reg2 * scale, thus account for 1
7035 // as soon as we use a second register.
7036 return AM.Scale != 0;
7037 return -1;
7038}
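
// Condensed sketch (not part of this file) of the return values above: a
// legal mode with Scale == 0 (plain "(%reg)") costs 0, a legal indexed mode
// costs 1 extra out-of-order allocation, and an illegal mode reports -1.
static int scalingFactorCostSketch(bool IsLegalMode, int Scale) {
  if (IsLegalMode)
    return Scale != 0 ? 1 : 0; // second register -> one extra allocation
  return -1;                   // caller must avoid this addressing mode
}
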
7039
7040 InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7041   // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7042 return 14;
7043}
7044
7045 bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7046   unsigned Bits = Ty->getScalarSizeInBits();
7047
7048 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7049 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7050 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7051 return false;
7052
7053 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7054 // shifts just as cheap as scalar ones.
7055 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7056 return false;
7057
7058 // AVX512BW has shifts such as vpsllvw.
7059 if (ST->hasBWI() && Bits == 16)
7060 return false;
7061
7062 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7063 // fully general vector.
7064 return true;
7065}
7066
7067unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7068 Type *ScalarValTy) const {
7069 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7070 return 4;
7071 }
7072 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7073}
7074
7075 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7076                                             SmallVectorImpl<Use *> &Ops) const {
7077 using namespace llvm::PatternMatch;
7078
7079 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7080 if (!VTy)
7081 return false;
7082
7083 if (I->getOpcode() == Instruction::Mul &&
7084 VTy->getElementType()->isIntegerTy(64)) {
7085 for (auto &Op : I->operands()) {
7086 // Make sure we are not already sinking this operand
7087 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7088 continue;
7089
7090 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7091 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7092 if (ST->hasSSE41() &&
7093 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7094 m_SpecificInt(32)))) {
7095 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7096 Ops.push_back(&Op);
7097 } else if (ST->hasSSE2() &&
7098 match(Op.get(),
7099 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7100 Ops.push_back(&Op);
7101 }
7102 }
7103
7104 return !Ops.empty();
7105 }
7106
7107 // A uniform shift amount in a vector shift or funnel shift may be much
7108 // cheaper than a generic variable vector shift, so make that pattern visible
7109 // to SDAG by sinking the shuffle instruction next to the shift.
7110 int ShiftAmountOpNum = -1;
7111 if (I->isShift())
7112 ShiftAmountOpNum = 1;
7113 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7114 if (II->getIntrinsicID() == Intrinsic::fshl ||
7115 II->getIntrinsicID() == Intrinsic::fshr)
7116 ShiftAmountOpNum = 2;
7117 }
7118
7119 if (ShiftAmountOpNum == -1)
7120 return false;
7121
7122 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7123 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7124 isVectorShiftByScalarCheap(I->getType())) {
7125 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7126 return true;
7127 }
7128
7129 return false;
7130}
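
// Example (illustrative sketch, not part of this file): the sext_inreg shape
// matched for PMULDQ above -- (x << 32) >> 32 arithmetically keeps only the
// sign-extended low 32 bits, which is what lets SDAG select PMULDQ once the
// chain is sunk next to the 64-bit multiply.
#include "llvm/IR/PatternMatch.h"

static bool isSext32InRegSketch(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  return match(V, m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
                         m_SpecificInt(32)));
}
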
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:478
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:396
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
Definition: DerivedTypes.h:670
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:703
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
bool empty() const
Definition: SmallVector.h:81
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
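Hypothetical usage (TTI and Ty are assumed to be in scope; not code from this file): the same operation can be priced under different kinds, and the answers legitimately diverge for expensive operations such as floating-point division.
  InstructionCost Thru = TTI.getArithmeticInstrCost(
      Instruction::FDiv, Ty, TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost Size = TTI.getArithmeticInstrCost(
      Instruction::FDiv, Ty, TargetTransformInfo::TCK_CodeSize);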
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
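A hedged example of a shuffle-cost query (Ctx, TTI, and CostKind are assumed to be in scope; not code from this file): price a broadcast of lane 0 across a <4 x float>.
  auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  InstructionCost C = TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
                                         VecTy, /*Mask=*/{}, CostKind,
                                         /*Index=*/0, /*SubTp=*/nullptr);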
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
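Much of this file's type dispatch reduces to these predicates. A standalone sketch (Ctx is assumed to be an LLVMContext):
  Type *F32 = Type::getFloatTy(Ctx);
  auto *V8F32 = FixedVectorType::get(F32, 8);
  assert(V8F32->isVectorTy() && V8F32->isFPOrFPVectorTy());
  assert(V8F32->getScalarType()->isFloatTy());
  assert(V8F32->getScalarSizeInBits() == 32);
  assert(V8F32->getPrimitiveSizeInBits().getFixedValue() == 256);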
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
Definition: DerivedTypes.h:487
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Definition: DerivedTypes.h:541
Type * getElementType() const
Definition: DerivedTypes.h:460
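Illustrative use of the two widening helpers (not code from this file); each preserves one property of the input type and doubles the other:
  auto *V4I16 = FixedVectorType::get(Type::getInt16Ty(Ctx), 4); // <4 x i16>
  // Same element count, integer elements twice as wide: <4 x i32>.
  auto *Ext = VectorType::getExtendedElementVectorType(V4I16);
  // Same element type, twice as many elements: <8 x i16>.
  auto *Dbl = VectorType::getDoubleElementsVectorType(V4I16);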
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasSSE42() const
Definition: X86Subtarget.h:198
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasAVX() const
Definition: X86Subtarget.h:199
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool hasAVX2() const
Definition: X86Subtarget.h:200
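These predicates gate the cost tables: lookups typically fall through from the newest feature level to the oldest. A sketch of the idiom (the table and variable names are assumed):
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;
  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;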
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool isLegalNTStore(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLegalNTLoad(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment)
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isLegalMaskedLoad(Type *DataType, Align Alignment)
bool hasConditionalLoadStoreForType(Type *Ty=nullptr) const
bool supportsEfficientVectorElementLoadStore() const
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
bool prefersVectorizedAddressing() const
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment)
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
Calculate the cost of Gather / Scatter operation.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getMaxInterleaveFactor(ElementCount VF)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isVectorShiftByScalarCheap(Type *Ty) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getBranchMispredictPenalty() const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
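A hedged example of ScaleBitMask (it lives in the llvm::APIntOps namespace; behavior as documented, widening splats each bit and narrowing merges neighbors, OR-style unless MatchAllBits is set):
  APInt Mask(2, 0b10);                             // 2-bit mask: 10
  APInt Wide = APIntOps::ScaleBitMask(Mask, 4);    // each bit splat: 1100
  assert(Wide == 12);
  APInt Back = APIntOps::ScaleBitMask(Wide, 2);    // merge pairs: 10 again
  assert(Back == 2);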
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware nodes for multiplication, in the same form as [SU]ADDO above.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
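A minimal PatternMatch sketch (the helper and its purpose are hypothetical; the matcher calls are real): bind the pieces of a single-use (load Ptr) & MaskC expression.
  static bool matchMaskedLoad(Value *V, Value *&Ptr, const APInt *&MaskC) {
    using namespace PatternMatch;
    // On success, Ptr is the load's pointer operand and MaskC the mask constant.
    return match(V, m_And(m_OneUse(m_Load(m_Value(Ptr))),
                          m_APIntAllowPoison(MaskC)));
  }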
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:557
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:396
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
constexpr int PoisonMaskElem
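Illustrative use of the range helpers together with PoisonMaskElem (standalone sketch): test whether a shuffle mask is the identity, treating poison lanes as wildcards.
  SmallVector<int, 8> Mask = {0, 1, PoisonMaskElem, 3};
  bool IsIdentity = all_of(enumerate(Mask), [](const auto &P) {
    return P.value() == PoisonMaskElem || P.value() == (int)P.index();
  });
  bool AnyPoison = any_of(Mask, [](int M) { return M == PoisonMaskElem; });
  assert(IsIdentity && AnyPoison);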
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
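Quick worked examples of the math helpers above (standalone; the values are chosen arbitrarily):
  assert(isPowerOf2_32(64));                  // 64 is a power of two
  assert(PowerOf2Ceil(33) == 64);             // round up to a power of two
  assert(divideCeil(33, 8) == 5);             // integer ceil(33 / 8)
  assert(alignDown(33, 8) == 32);             // round down to a multiple of 8
  assert(alignTo(33, Align(8)) == 40);        // round up to a multiple of 8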
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
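A hedged sketch of the table-lookup helpers in the style this file uses (the table contents and the cost value are hypothetical):
  static const TypeConversionCostTblEntry ConvTbl[] = {
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, // hypothetical cost
  };
  if (const auto *Entry = ConvertCostTableLookup(ConvTbl, ISD::SINT_TO_FP,
                                                 MVT::v4f32, MVT::v4i32))
    return Entry->Cost; // CostTableLookup works the same, keyed on a single MVT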
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
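A short EVT example (Ctx is assumed to be an LLVMContext; not code from this file): simple EVTs wrap an MVT, while extended EVTs describe types with no machine equivalent.
  EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4);     // v4i32, a simple EVT
  assert(VT.isSimple() && VT.isVector());
  assert(VT.getSimpleVT() == MVT::v4i32);          // only valid when isSimple()
  assert(VT.getScalarType() == EVT(MVT::i32));
  assert(VT.getSizeInBits().getFixedValue() == 128);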
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
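Illustrative MaybeAlign handling (standalone sketch), including the commonAlignment helper listed earlier:
  MaybeAlign MA;                                   // undefined alignment
  Align A = MA.valueOrOne();                       // falls back to 1
  assert(A.value() == 1);
  assert(commonAlignment(Align(16), 8).value() == 8); // limited by the offset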
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55